Merge branch 'main' into nvidia-eval-integration

Jash Gulabrai 2025-04-17 13:36:42 -04:00
commit 2117af25a7
27 changed files with 748 additions and 159 deletions


@@ -86,15 +86,15 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
        with:
          python-version: '3.10'
      - name: Install uv
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1
        with:
          python-version: "3.10"


@@ -1,9 +1,32 @@
 document.addEventListener("DOMContentLoaded", function () {
   const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
   const htmlElement = document.documentElement;
-  if (prefersDark) {
-    htmlElement.setAttribute("data-theme", "dark");
+  // Check if theme is saved in localStorage
+  const savedTheme = localStorage.getItem("sphinx-rtd-theme");
+  if (savedTheme) {
+    // Use the saved theme preference
+    htmlElement.setAttribute("data-theme", savedTheme);
+    document.body.classList.toggle("dark", savedTheme === "dark");
   } else {
-    htmlElement.setAttribute("data-theme", "light");
+    // Fall back to system preference
+    const theme = prefersDark ? "dark" : "light";
+    htmlElement.setAttribute("data-theme", theme);
+    document.body.classList.toggle("dark", theme === "dark");
+    // Save initial preference
+    localStorage.setItem("sphinx-rtd-theme", theme);
   }
+  // Listen for theme changes from the existing toggle
+  const observer = new MutationObserver(function(mutations) {
+    mutations.forEach(function(mutation) {
+      if (mutation.attributeName === "data-theme") {
+        const currentTheme = htmlElement.getAttribute("data-theme");
+        localStorage.setItem("sphinx-rtd-theme", currentTheme);
+      }
+    });
+  });
+  observer.observe(htmlElement, { attributes: true });
 });


@@ -5221,17 +5221,25 @@
           "default": 10
         },
         "model": {
-          "type": "string"
+          "type": "string",
+          "description": "The model identifier to use for the agent"
         },
         "instructions": {
-          "type": "string"
+          "type": "string",
+          "description": "The system instructions for the agent"
+        },
+        "name": {
+          "type": "string",
+          "description": "Optional name for the agent, used in telemetry and identification"
         },
         "enable_session_persistence": {
           "type": "boolean",
-          "default": false
+          "default": false,
+          "description": "Optional flag indicating whether session data has to be persisted"
         },
         "response_format": {
-          "$ref": "#/components/schemas/ResponseFormat"
+          "$ref": "#/components/schemas/ResponseFormat",
+          "description": "Optional response format configuration"
         }
       },
       "additionalProperties": false,
@@ -5239,7 +5247,8 @@
         "model",
         "instructions"
       ],
-      "title": "AgentConfig"
+      "title": "AgentConfig",
+      "description": "Configuration for an agent."
     },
     "AgentTool": {
       "oneOf": [
@@ -8891,8 +8900,7 @@
       },
       "additionalProperties": false,
       "required": [
-        "role",
-        "content"
+        "role"
       ],
       "title": "OpenAIAssistantMessageParam",
       "description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request."


@@ -3686,18 +3686,29 @@ components:
          default: 10
        model:
          type: string
+          description: >-
+            The model identifier to use for the agent
        instructions:
          type: string
+          description: The system instructions for the agent
+        name:
+          type: string
+          description: >-
+            Optional name for the agent, used in telemetry and identification
        enable_session_persistence:
          type: boolean
          default: false
+          description: >-
+            Optional flag indicating whether session data has to be persisted
        response_format:
          $ref: '#/components/schemas/ResponseFormat'
+          description: Optional response format configuration
      additionalProperties: false
      required:
      - model
      - instructions
      title: AgentConfig
+      description: Configuration for an agent.
    AgentTool:
      oneOf:
      - type: string
@@ -6097,7 +6108,6 @@ components:
      additionalProperties: false
      required:
      - role
-      - content
      title: OpenAIAssistantMessageParam
      description: >-
        A message containing the model's (assistant) response in an OpenAI-compatible


@@ -24,7 +24,7 @@ The key files in the app are `ExampleLlamaStackLocalInference.kt`, `ExampleLlama
 Add the following dependency in your `build.gradle.kts` file:
 ```
 dependencies {
- implementation("com.llama.llamastack:llama-stack-client-kotlin:0.1.4.2")
+ implementation("com.llama.llamastack:llama-stack-client-kotlin:0.2.2")
 }
 ```
 This will download jar files in your gradle cache in a directory like `~/.gradle/caches/modules-2/files-2.1/com.llama.llamastack/`
@@ -37,11 +37,7 @@ For local inferencing, it is required to include the ExecuTorch library into you
 Include the ExecuTorch library by:
 1. Download the `download-prebuilt-et-lib.sh` script file from the [llama-stack-client-kotlin-client-local](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/llama-stack-client-kotlin-client-local/download-prebuilt-et-lib.sh) directory to your local machine.
-2. Move the script to the top level of your Android app where the app directory resides:
-<p align="center">
-<img src="https://github.com/meta-llama/llama-stack-client-kotlin/blob/latest-release/doc/img/example_android_app_directory.png" style="width:300px">
-</p>
+2. Move the script to the top level of your Android app where the `app` directory resides.
 3. Run `sh download-prebuilt-et-lib.sh` to create an `app/libs` directory and download the `executorch.aar` in that path. This generates an ExecuTorch library for the XNNPACK delegate.
 4. Add the `executorch.aar` dependency in your `build.gradle.kts` file:
 ```
@@ -52,6 +48,8 @@ dependencies {
 }
 ```
+See the local RAG Android app [README](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app#quick-start) for other dependencies.
 ## Llama Stack APIs in Your Android App
 Breaking down the demo app, this section will show the core pieces that are used to initialize and run inference with Llama Stack using the Kotlin library.
@@ -60,7 +58,7 @@ Start a Llama Stack server on localhost. Here is an example of how you can do th
 ```
 conda create -n stack-fireworks python=3.10
 conda activate stack-fireworks
-pip install --no-cache llama-stack==0.1.4
+pip install --no-cache llama-stack==0.2.2
 llama stack build --template fireworks --image-type conda
 export FIREWORKS_API_KEY=<SOME_KEY>
 llama stack run fireworks --port 5050


@@ -1,89 +0,0 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# NVIDIA Distribution
The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `inline::localfs` |
| eval | `remote::nvidia` |
| inference | `remote::nvidia` |
| post_training | `remote::nvidia` |
| safety | `remote::nvidia` |
| scoring | `inline::basic` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `inline::rag-runtime` |
| vector_io | `inline::faiss` |
### Environment Variables
The following environment variables can be configured:
- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
### Models
The following models are available by default:
- `meta/llama3-8b-instruct (aliases: meta-llama/Llama-3-8B-Instruct)`
- `meta/llama3-70b-instruct (aliases: meta-llama/Llama-3-70B-Instruct)`
- `meta/llama-3.1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `meta/llama-3.1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
- `meta/llama-3.1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
- `meta/llama-3.2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
- `nvidia/llama-3.2-nv-embedqa-1b-v2 `
- `nvidia/nv-embedqa-e5-v5 `
- `nvidia/nv-embedqa-mistral-7b-v2 `
- `snowflake/arctic-embed-l `
### Prerequisite: API Keys
Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
## Running Llama Stack with NVIDIA
You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=8321
docker run \
-it \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-nvidia \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
```
### Via Conda
```bash
llama stack build --template nvidia --image-type conda
llama stack run ./run.yaml \
--port 8321 \
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
--env INFERENCE_MODEL=$INFERENCE_MODEL
```


@@ -46,20 +46,91 @@ The following models are available by default:
 - `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
 - `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
 - `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
+- `meta/llama-3.3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
 - `nvidia/llama-3.2-nv-embedqa-1b-v2`
 - `nvidia/nv-embedqa-e5-v5`
 - `nvidia/nv-embedqa-mistral-7b-v2`
 - `snowflake/arctic-embed-l`
-### Prerequisite: API Keys
-Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
+## Prerequisites
+### NVIDIA API Keys
+Make sure you have access to an NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
+### Deploy NeMo Microservices Platform
+The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/set-up/deploy-as-platform/index.html) for platform prerequisites and instructions to install and deploy the platform.
+## Supported Services
+Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
+### Inference: NVIDIA NIM
+NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
+1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (requires an API key)
+2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
+The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
+### Datasetio API: NeMo Data Store
+The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use that client to interact with the Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
+See the [NVIDIA Datasetio docs](/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
+### Eval API: NeMo Evaluator
+The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
+See the [NVIDIA Eval docs](/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
+### Post-Training API: NeMo Customizer
+The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
+See the [NVIDIA Post-Training docs](/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
+### Safety API: NeMo Guardrails
+The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
+See the NVIDIA Safety docs for supported features and example usage.
+## Deploying models
+In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
+Note: For improved inference speeds, we need to use NIM with the `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
+```sh
+# URL to NeMo NIM Proxy service
+export NEMO_URL="http://nemo.test"
+curl --location "$NEMO_URL/v1/deployment/model-deployments" \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "name": "llama-3.2-1b-instruct",
+    "namespace": "meta",
+    "config": {
+      "model": "meta/llama-3.2-1b-instruct",
+      "nim_deployment": {
+        "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
+        "image_tag": "1.8.3",
+        "pvc_size": "25Gi",
+        "gpu": 1,
+        "additional_envs": {
+          "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
+        }
+      }
+    }
+  }'
+```
+This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/get-started/tutorials/deploy-nims.html#) for more information on how to deploy a NIM and verify it's available for inference.
+You can also remove a deployed NIM to free up GPU resources, if needed.
+```sh
+export NEMO_URL="http://nemo.test"
+curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
+```
 ## Running Llama Stack with NVIDIA
-You can do this via Conda (build code) or Docker which has a pre-built image.
+You can do this via Conda or venv (build code), or Docker, which has a pre-built image.
 ### Via Docker
@@ -81,9 +152,27 @@ docker run \
 ### Via Conda
 ```bash
+INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
 llama stack build --template nvidia --image-type conda
 llama stack run ./run.yaml \
   --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
   --env INFERENCE_MODEL=$INFERENCE_MODEL
 ```
+### Via venv
+If you've set up your local development environment, you can also build the image using your local virtual environment.
+```bash
+INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
+llama stack build --template nvidia --image-type venv
+llama stack run ./run.yaml \
+  --port 8321 \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL
+```
+### Example Notebooks
+You can reference the Jupyter notebooks in `docs/notebooks/nvidia/` for example usage of these APIs.
+- [Llama_Stack_NVIDIA_E2E_Flow.ipynb](/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb) contains an end-to-end workflow for running inference, customizing, and evaluating models using your deployed NeMo Microservices platform.
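The Datasetio section in the hunk above notes that the NeMo Data Store exposes a Hugging Face Hub-compatible API. As a rough illustration of what that looks like from Python (not part of this diff; the endpoint value and the dummy token are assumptions):

```python
import os

from huggingface_hub import HfApi

# Point the Hugging Face Hub client at the NeMo Data Store endpoint.
# NVIDIA_DATASETS_URL is assumed to be set, e.g. "http://data-store.test".
hf_api = HfApi(endpoint=os.environ["NVIDIA_DATASETS_URL"], token="not-used")

# Browse datasets stored in the Data Store exactly as you would on the Hub.
for dataset in hf_api.list_datasets():
    print(dataset.id)
```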


@@ -41,7 +41,7 @@ The following environment variables can be configured:
 ## Setting up vLLM server
-In the following sections, we'll use either AMD and NVIDIA GPUs to serve as hardware accelerators for the vLLM
+In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM
 server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
 [supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
 that we only use GPUs here for demonstration purposes.
@@ -162,6 +162,55 @@ docker run \
     --port $SAFETY_PORT
 ```
+### Setting up vLLM server on Intel GPU
+Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to the vLLM-side setup, which walks through installing vLLM from source or building your own vLLM Docker container, Intel provides a prebuilt vLLM container for systems with Intel GPUs supported by the PyTorch XPU backend:
+- [intel/vllm](https://hub.docker.com/r/intel/vllm)
+Here is a sample script to start a vLLM server locally via Docker using the Intel-provided container:
+```bash
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct
+export ZE_AFFINITY_MASK=0
+docker run \
+    --pull always \
+    --device /dev/dri \
+    -v /dev/dri/by-path:/dev/dri/by-path \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+    --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
+    -p $INFERENCE_PORT:$INFERENCE_PORT \
+    --ipc=host \
+    intel/vllm:xpu \
+    --gpu-memory-utilization 0.7 \
+    --model $INFERENCE_MODEL \
+    --port $INFERENCE_PORT
+```
+If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B`, using a script like:
+```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export ZE_AFFINITY_MASK=1
+docker run \
+    --pull always \
+    --device /dev/dri \
+    -v /dev/dri/by-path:/dev/dri/by-path \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+    --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
+    -p $SAFETY_PORT:$SAFETY_PORT \
+    --ipc=host \
+    intel/vllm:xpu \
+    --gpu-memory-utilization 0.7 \
+    --model $SAFETY_MODEL \
+    --port $SAFETY_PORT
+```
 ## Running Llama Stack
 Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.


@@ -225,8 +225,18 @@ class AgentConfigCommon(BaseModel):
 @json_schema_type
 class AgentConfig(AgentConfigCommon):
+    """Configuration for an agent.
+
+    :param model: The model identifier to use for the agent
+    :param instructions: The system instructions for the agent
+    :param name: Optional name for the agent, used in telemetry and identification
+    :param enable_session_persistence: Optional flag indicating whether session data has to be persisted
+    :param response_format: Optional response format configuration
+    """
+
     model: str
     instructions: str
+    name: Optional[str] = None
     enable_session_persistence: Optional[bool] = False
     response_format: Optional[ResponseFormat] = None
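For reference, a minimal sketch of constructing the updated config with the new `name` field (the import path and field values are illustrative assumptions, not code from this commit):

```python
from llama_stack.apis.agents import AgentConfig  # import path assumed

# `model` and `instructions` remain the only required fields; `name` is optional
# and, when set, is emitted on telemetry spans as `agent_name`.
config = AgentConfig(
    model="meta-llama/Llama-3.1-8B-Instruct",
    instructions="You are a helpful assistant.",
    name="docs-qa-agent",
    enable_session_persistence=False,
)
print(config.model_dump_json(indent=2))
```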


@@ -526,9 +526,9 @@ class OpenAIAssistantMessageParam(BaseModel):
     """
     role: Literal["assistant"] = "assistant"
-    content: OpenAIChatCompletionMessageContent
+    content: Optional[OpenAIChatCompletionMessageContent] = None
     name: Optional[str] = None
-    tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = Field(default_factory=list)
+    tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = None
 @json_schema_type
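The motivation for making `content` optional is that an OpenAI-compatible assistant message may carry only tool calls. A sketch of such a message, written as a plain dict in the OpenAI wire format (the function name and arguments are made up for illustration):

```python
# An assistant turn that only requests a tool call: there is no textual content,
# which is why `content` can no longer be a required field.
assistant_message = {
    "role": "assistant",
    "content": None,
    "tool_calls": [
        {
            "id": "call_0",
            "type": "function",
            "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'},
        }
    ],
}
```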


@@ -235,10 +235,14 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
         )
     except (Exception, RuntimeError) as exc:
+        import traceback
+
         cprint(
             f"Error building stack: {exc}",
             color="red",
         )
+        cprint("Stack trace:", color="red")
+        traceback.print_exc()
         sys.exit(1)
     if run_config is None:
         cprint(
@@ -350,7 +354,7 @@ def _run_stack_build_command_from_build_config(
         build_config,
         build_file_path,
         image_name,
-        template_or_config=template_name or config_path,
+        template_or_config=template_name or config_path or str(build_file_path),
     )
     if return_code != 0:
         raise RuntimeError(f"Failed to build image {image_name}")


@@ -37,6 +37,17 @@ def tool_chat_page():
         label="Available ToolGroups", options=builtin_tools_list, selection_mode="multi", on_change=reset_agent
     )
+    if "builtin::rag" in toolgroup_selection:
+        vector_dbs = llama_stack_api.client.vector_dbs.list() or []
+        if not vector_dbs:
+            st.info("No vector databases available for selection.")
+        vector_dbs = [vector_db.identifier for vector_db in vector_dbs]
+        selected_vector_dbs = st.multiselect(
+            label="Select Document Collections to use in RAG queries",
+            options=vector_dbs,
+            on_change=reset_agent,
+        )
     st.subheader("MCP Servers")
     mcp_selection = st.pills(
         label="Available MCP Servers", options=mcp_tools_list, selection_mode="multi", on_change=reset_agent
@@ -56,6 +67,27 @@ def tool_chat_page():
     st.subheader(f"Active Tools: 🛠 {len(active_tool_list)}")
     st.json(active_tool_list)
+    st.subheader("Chat Configurations")
+    max_tokens = st.slider(
+        "Max Tokens",
+        min_value=0,
+        max_value=4096,
+        value=512,
+        step=1,
+        help="The maximum number of tokens to generate",
+        on_change=reset_agent,
+    )
+    for i, tool_name in enumerate(toolgroup_selection):
+        if tool_name == "builtin::rag":
+            tool_dict = dict(
+                name="builtin::rag",
+                args={
+                    "vector_db_ids": list(selected_vector_dbs),
+                },
+            )
+            toolgroup_selection[i] = tool_dict
     @st.cache_resource
     def create_agent():
         return Agent(
@@ -63,9 +95,7 @@ def tool_chat_page():
             model=model,
             instructions="You are a helpful assistant. When you use a tool always respond with a summary of the result.",
             tools=toolgroup_selection,
-            sampling_params={
-                "strategy": {"type": "greedy"},
-            },
+            sampling_params={"strategy": {"type": "greedy"}, "max_tokens": max_tokens},
         )
     agent = create_agent()
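For reference, a small sketch of the shape `toolgroup_selection` takes after the loop above rewrites the RAG entry; the tool-group name and vector DB id below are placeholders:

```python
# Plain tool-group names stay as strings; the RAG entry becomes a parameterized dict
# carrying the vector DBs chosen in the sidebar.
toolgroup_selection = [
    "builtin::websearch",
    {"name": "builtin::rag", "args": {"vector_db_ids": ["my-docs"]}},
]
```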


@@ -178,6 +178,8 @@ class ChatAgent(ShieldRunnerMixin):
             span.set_attribute("request", request.model_dump_json())
             turn_id = str(uuid.uuid4())
             span.set_attribute("turn_id", turn_id)
+            if self.agent_config.name:
+                span.set_attribute("agent_name", self.agent_config.name)
             await self._initialize_tools(request.toolgroups)
             async for chunk in self._run_turn(request, turn_id):
@@ -190,6 +192,8 @@ class ChatAgent(ShieldRunnerMixin):
             span.set_attribute("session_id", request.session_id)
             span.set_attribute("request", request.model_dump_json())
             span.set_attribute("turn_id", request.turn_id)
+            if self.agent_config.name:
+                span.set_attribute("agent_name", self.agent_config.name)
             await self._initialize_tools()
             async for chunk in self._run_turn(request):
@@ -498,6 +502,8 @@ class ChatAgent(ShieldRunnerMixin):
         stop_reason = None
         async with tracing.span("inference") as span:
+            if self.agent_config.name:
+                span.set_attribute("agent_name", self.agent_config.name)
             async for chunk in await self.inference_api.chat_completion(
                 self.agent_config.model,
                 input_messages,


@@ -0,0 +1,85 @@
# NVIDIA Inference Provider for LlamaStack
This provider enables running inference using NVIDIA NIM.
## Features
- Endpoints for completions, chat completions, and embeddings for registered models
## Getting Started
### Prerequisites
- LlamaStack with NVIDIA configuration
- Access to NVIDIA NIM deployment
- A NIM for the model you want to use for inference is deployed
### Setup
Build the NVIDIA environment:
```bash
llama stack build --template nvidia --image-type conda
```
### Basic Usage using the LlamaStack Python Client
#### Initialize the client
```python
import os
os.environ["NVIDIA_API_KEY"] = (
"" # Required if using hosted NIM endpoint. If self-hosted, not required.
)
os.environ["NVIDIA_BASE_URL"] = "http://nim.test" # NIM URL
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient("nvidia")
client.initialize()
```
### Create Completion
```python
response = client.completion(
model_id="meta-llama/Llama-3.1-8b-Instruct",
content="Complete the sentence using one word: Roses are red, violets are :",
stream=False,
sampling_params={
"max_tokens": 50,
},
)
print(f"Response: {response.content}")
```
### Create Chat Completion
```python
response = client.chat_completion(
model_id="meta-llama/Llama-3.1-8b-Instruct",
messages=[
{
"role": "system",
"content": "You must respond to each message with only one word",
},
{
"role": "user",
"content": "Complete the sentence using one word: Roses are red, violets are:",
},
],
stream=False,
sampling_params={
"max_tokens": 50,
},
)
print(f"Response: {response.completion_message.content}")
```
### Create Embeddings
```python
response = client.embeddings(
model_id="meta-llama/Llama-3.1-8b-Instruct", contents=["foo", "bar", "baz"]
)
print(f"Embeddings: {response.embeddings}")
```


@@ -48,6 +48,10 @@ MODEL_ENTRIES = [
         "meta/llama-3.2-90b-vision-instruct",
         CoreModelId.llama3_2_90b_vision_instruct.value,
     ),
+    build_hf_repo_model_entry(
+        "meta/llama-3.3-70b-instruct",
+        CoreModelId.llama3_3_70b_instruct.value,
+    ),
     # NeMo Retriever Text Embedding models -
     #
     # https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html


@@ -374,7 +374,8 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
             options["max_tokens"] = self.config.max_tokens
         input_dict: dict[str, Any] = {}
-        if isinstance(request, ChatCompletionRequest) and request.tools is not None:
+        # Only include the 'tools' param if there are any; an empty list can break vLLM.
+        if isinstance(request, ChatCompletionRequest) and request.tools:
             input_dict = {"tools": _convert_to_vllm_tools_in_request(request.tools)}
         if isinstance(request, ChatCompletionRequest):


@@ -16,7 +16,11 @@ _MODEL_ENTRIES = [
     build_hf_repo_model_entry(
         "meta/llama-3.1-8b-instruct",
         CoreModelId.llama3_1_8b_instruct.value,
-    )
+    ),
+    build_hf_repo_model_entry(
+        "meta/llama-3.2-1b-instruct",
+        CoreModelId.llama3_2_1b_instruct.value,
+    ),
 ]


@@ -27,11 +27,12 @@ from .models import _MODEL_ENTRIES
 # Map API status to JobStatus enum
 STATUS_MAPPING = {
-    "running": "in_progress",
-    "completed": "completed",
-    "failed": "failed",
-    "cancelled": "cancelled",
-    "pending": "scheduled",
+    "running": JobStatus.in_progress.value,
+    "completed": JobStatus.completed.value,
+    "failed": JobStatus.failed.value,
+    "cancelled": JobStatus.cancelled.value,
+    "pending": JobStatus.scheduled.value,
+    "unknown": JobStatus.scheduled.value,
 }
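As a rough sketch of how this mapping is typically consumed when translating a NeMo Customizer status into a Llama Stack job status (the import path and helper are assumptions, not code from this commit):

```python
from llama_stack.apis.common.job_types import JobStatus  # import path assumed


def to_job_status(customizer_status: str) -> JobStatus:
    # Unrecognized statuses fall back to "scheduled", mirroring the "unknown" entry above.
    return JobStatus(STATUS_MAPPING.get(customizer_status, JobStatus.scheduled.value))


assert to_job_status("running") is JobStatus.in_progress
```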


@@ -0,0 +1,77 @@
# NVIDIA Safety Provider for LlamaStack
This provider enables safety checks and guardrails for LLM interactions using NVIDIA's NeMo Guardrails service.
## Features
- Run safety checks for messages
## Getting Started
### Prerequisites
- LlamaStack with NVIDIA configuration
- Access to NVIDIA NeMo Guardrails service
- A NIM for the model you want to use for safety checks is deployed
### Setup
Build the NVIDIA environment:
```bash
llama stack build --template nvidia --image-type conda
```
### Basic Usage using the LlamaStack Python Client
#### Initialize the client
```python
import os
os.environ["NVIDIA_API_KEY"] = "your-api-key"
os.environ["NVIDIA_GUARDRAILS_URL"] = "http://guardrails.test"
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient("nvidia")
client.initialize()
```
#### Create a safety shield
```python
from llama_stack.apis.safety import Shield
from llama_stack.apis.inference import Message
# Create a safety shield
shield = Shield(
shield_id="your-shield-id",
provider_resource_id="safety-model-id", # The model to use for safety checks
description="Safety checks for content moderation",
)
# Register the shield
await client.safety.register_shield(shield)
```
#### Run safety checks
```python
# Messages to check
messages = [Message(role="user", content="Your message to check")]
# Run safety check
response = await client.safety.run_shield(
shield_id="your-shield-id",
messages=messages,
)
# Check for violations
if response.violation:
print(f"Safety violation detected: {response.violation.user_message}")
print(f"Violation level: {response.violation.violation_level}")
print(f"Metadata: {response.violation.metadata}")
else:
print("No safety violations detected")
```


@@ -25,14 +25,84 @@ The following models are available by default:
 {% endif %}
-### Prerequisite: API Keys
-Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
+## Prerequisites
+### NVIDIA API Keys
+Make sure you have access to an NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
+### Deploy NeMo Microservices Platform
+The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/set-up/deploy-as-platform/index.html) for platform prerequisites and instructions to install and deploy the platform.
+## Supported Services
+Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
+### Inference: NVIDIA NIM
+NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
+1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (requires an API key)
+2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
+The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
+### Datasetio API: NeMo Data Store
+The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use that client to interact with the Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
+See the [NVIDIA Datasetio docs](/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
+### Eval API: NeMo Evaluator
+The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
+See the [NVIDIA Eval docs](/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
+### Post-Training API: NeMo Customizer
+The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
+See the [NVIDIA Post-Training docs](/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
+### Safety API: NeMo Guardrails
+The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
+See the NVIDIA Safety docs for supported features and example usage.
+## Deploying models
+In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
+Note: For improved inference speeds, we need to use NIM with the `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
+```sh
+# URL to NeMo NIM Proxy service
+export NEMO_URL="http://nemo.test"
+curl --location "$NEMO_URL/v1/deployment/model-deployments" \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "name": "llama-3.2-1b-instruct",
+    "namespace": "meta",
+    "config": {
+      "model": "meta/llama-3.2-1b-instruct",
+      "nim_deployment": {
+        "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
+        "image_tag": "1.8.3",
+        "pvc_size": "25Gi",
+        "gpu": 1,
+        "additional_envs": {
+          "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
+        }
+      }
+    }
+  }'
+```
+This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/get-started/tutorials/deploy-nims.html#) for more information on how to deploy a NIM and verify it's available for inference.
+You can also remove a deployed NIM to free up GPU resources, if needed.
+```sh
+export NEMO_URL="http://nemo.test"
+curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
+```
 ## Running Llama Stack with NVIDIA
-You can do this via Conda (build code) or Docker which has a pre-built image.
+You can do this via Conda or venv (build code), or Docker, which has a pre-built image.
 ### Via Docker
@@ -54,9 +124,27 @@ docker run \
 ### Via Conda
 ```bash
+INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
 llama stack build --template nvidia --image-type conda
 llama stack run ./run.yaml \
   --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
   --env INFERENCE_MODEL=$INFERENCE_MODEL
 ```
+### Via venv
+If you've set up your local development environment, you can also build the image using your local virtual environment.
+```bash
+INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
+llama stack build --template nvidia --image-type venv
+llama stack run ./run.yaml \
+  --port 8321 \
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL
+```
+### Example Notebooks
+You can reference the Jupyter notebooks in `docs/notebooks/nvidia/` for example usage of these APIs.
+- [Llama_Stack_NVIDIA_E2E_Flow.ipynb](/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb) contains an end-to-end workflow for running inference, customizing, and evaluating models using your deployed NeMo Microservices platform.


@@ -170,6 +170,16 @@ models:
   provider_id: nvidia
   provider_model_id: meta/llama-3.2-90b-vision-instruct
   model_type: llm
+- metadata: {}
+  model_id: meta/llama-3.3-70b-instruct
+  provider_id: nvidia
+  provider_model_id: meta/llama-3.3-70b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.3-70B-Instruct
+  provider_id: nvidia
+  provider_model_id: meta/llama-3.3-70b-instruct
+  model_type: llm
 - metadata:
     embedding_dimension: 2048
     context_length: 8192


@@ -28,7 +28,7 @@ The following environment variables can be configured:
 ## Setting up vLLM server
-In the following sections, we'll use either AMD and NVIDIA GPUs to serve as hardware accelerators for the vLLM
+In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM
 server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also
 [supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and
 that we only use GPUs here for demonstration purposes.
@@ -149,6 +149,55 @@ docker run \
     --port $SAFETY_PORT
 ```
+### Setting up vLLM server on Intel GPU
+Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to the vLLM-side setup, which walks through installing vLLM from source or building your own vLLM Docker container, Intel provides a prebuilt vLLM container for systems with Intel GPUs supported by the PyTorch XPU backend:
+- [intel/vllm](https://hub.docker.com/r/intel/vllm)
+Here is a sample script to start a vLLM server locally via Docker using the Intel-provided container:
+```bash
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct
+export ZE_AFFINITY_MASK=0
+docker run \
+    --pull always \
+    --device /dev/dri \
+    -v /dev/dri/by-path:/dev/dri/by-path \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+    --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
+    -p $INFERENCE_PORT:$INFERENCE_PORT \
+    --ipc=host \
+    intel/vllm:xpu \
+    --gpu-memory-utilization 0.7 \
+    --model $INFERENCE_MODEL \
+    --port $INFERENCE_PORT
+```
+If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B`, using a script like:
+```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export ZE_AFFINITY_MASK=1
+docker run \
+    --pull always \
+    --device /dev/dri \
+    -v /dev/dri/by-path:/dev/dri/by-path \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+    --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \
+    -p $SAFETY_PORT:$SAFETY_PORT \
+    --ipc=host \
+    intel/vllm:xpu \
+    --gpu-memory-utilization 0.7 \
+    --model $SAFETY_MODEL \
+    --port $SAFETY_PORT
+```
 ## Running Llama Stack
 Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.


@@ -115,6 +115,70 @@ def test_agent_simple(llama_stack_client_with_mocked_inference, agent_config):
     assert "I can't" in logs_str
+
+def test_agent_name(llama_stack_client, text_model_id):
+    agent_name = f"test-agent-{uuid4()}"
+    try:
+        agent = Agent(
+            llama_stack_client,
+            model=text_model_id,
+            instructions="You are a helpful assistant",
+            name=agent_name,
+        )
+    except TypeError:
+        agent = Agent(
+            llama_stack_client,
+            model=text_model_id,
+            instructions="You are a helpful assistant",
+        )
+        return
+    session_id = agent.create_session(f"test-session-{uuid4()}")
+    agent.create_turn(
+        messages=[
+            {
+                "role": "user",
+                "content": "Give me a sentence that contains the word: hello",
+            }
+        ],
+        session_id=session_id,
+        stream=False,
+    )
+    all_spans = []
+    for span in llama_stack_client.telemetry.query_spans(
+        attribute_filters=[
+            {"key": "session_id", "op": "eq", "value": session_id},
+        ],
+        attributes_to_return=["input", "output", "agent_name", "agent_id", "session_id"],
+    ):
+        all_spans.append(span.attributes)
+    agent_name_spans = []
+    for span in llama_stack_client.telemetry.query_spans(
+        attribute_filters=[],
+        attributes_to_return=["agent_name"],
+    ):
+        if "agent_name" in span.attributes:
+            agent_name_spans.append(span.attributes)
+    agent_logs = []
+    for span in llama_stack_client.telemetry.query_spans(
+        attribute_filters=[
+            {"key": "agent_name", "op": "eq", "value": agent_name},
+        ],
+        attributes_to_return=["input", "output", "agent_name"],
+    ):
+        if "output" in span.attributes and span.attributes["output"] != "no shields":
+            agent_logs.append(span.attributes)
+    assert len(agent_logs) == 1
+    assert agent_logs[0]["agent_name"] == agent_name
+    assert "Give me a sentence that contains the word: hello" in agent_logs[0]["input"]
+    assert "hello" in agent_logs[0]["output"].lower()
 def test_tool_config(llama_stack_client_with_mocked_inference, agent_config):
     common_params = dict(
         model="meta-llama/Llama-3.2-3B-Instruct",


@@ -31,6 +31,7 @@ def data_url_from_file(file_path: str) -> str:
     return data_url
+@pytest.mark.skip(reason="flaky. Couldn't find 'llamastack/simpleqa' on the Hugging Face Hub")
 @pytest.mark.parametrize(
     "purpose, source, provider_id, limit",
     [


@@ -0,0 +1,38 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pathlib import Path

from llama_stack.cli.stack._build import (
    _run_stack_build_command_from_build_config,
)
from llama_stack.distribution.datatypes import BuildConfig, DistributionSpec
from llama_stack.distribution.utils.image_types import LlamaStackImageType


def test_container_build_passes_path(monkeypatch, tmp_path):
    called_with = {}

    def spy_build_image(cfg, build_file_path, image_name, template_or_config):
        called_with["path"] = template_or_config
        return 0

    monkeypatch.setattr(
        "llama_stack.cli.stack._build.build_image",
        spy_build_image,
        raising=True,
    )

    cfg = BuildConfig(
        image_type=LlamaStackImageType.CONTAINER.value,
        distribution_spec=DistributionSpec(providers={}, description=""),
    )

    _run_stack_build_command_from_build_config(cfg, image_name="dummy")

    assert "path" in called_with
    assert isinstance(called_with["path"], str)
    assert Path(called_with["path"]).exists()


@@ -26,7 +26,12 @@ from openai.types.chat.chat_completion_chunk import (
 )
 from openai.types.model import Model as OpenAIModel
-from llama_stack.apis.inference import ToolChoice, ToolConfig
+from llama_stack.apis.inference import (
+    ChatCompletionRequest,
+    ToolChoice,
+    ToolConfig,
+    UserMessage,
+)
 from llama_stack.apis.models import Model
 from llama_stack.models.llama.datatypes import StopReason
 from llama_stack.providers.remote.inference.vllm.config import VLLMInferenceAdapterConfig
@@ -232,3 +237,14 @@ def test_chat_completion_doesnt_block_event_loop(caplog):
     # above.
     asyncio_warnings = [record.message for record in caplog.records if record.name == "asyncio"]
     assert not asyncio_warnings
+
+
+@pytest.mark.asyncio
+async def test_get_params_empty_tools(vllm_inference_adapter):
+    request = ChatCompletionRequest(
+        tools=[],
+        model="test_model",
+        messages=[UserMessage(content="test")],
+    )
+    params = await vllm_inference_adapter._get_params(request)
+    assert "tools" not in params


@@ -200,10 +200,21 @@ class TestNvidiaPostTraining(unittest.TestCase):
         )
     def test_get_training_job_status(self):
+        customizer_status_to_job_status = [
+            ("running", "in_progress"),
+            ("completed", "completed"),
+            ("failed", "failed"),
+            ("cancelled", "cancelled"),
+            ("pending", "scheduled"),
+            ("unknown", "scheduled"),
+        ]
+        for customizer_status, expected_status in customizer_status_to_job_status:
+            with self.subTest(customizer_status=customizer_status, expected_status=expected_status):
                 self.mock_make_request.return_value = {
                     "created_at": "2024-12-09T04:06:28.580220",
                     "updated_at": "2024-12-09T04:21:19.852832",
-                    "status": "completed",
+                    "status": customizer_status,
                     "steps_completed": 1210,
                     "epochs_completed": 2,
                     "percentage_done": 100.0,
@@ -217,7 +228,7 @@ class TestNvidiaPostTraining(unittest.TestCase):
                 status = self.run_async(self.adapter.get_training_job_status(job_uuid=job_id))
                 assert isinstance(status, NvidiaPostTrainingJobStatusResponse)
-                assert status.status.value == "completed"
+                assert status.status.value == expected_status
                 assert status.steps_completed == 1210
                 assert status.epochs_completed == 2
                 assert status.percentage_done == 100.0
@@ -225,9 +236,11 @@ class TestNvidiaPostTraining(unittest.TestCase):
                 assert status.train_loss == 1.718016266822815
                 assert status.val_loss == 1.8661999702453613
-                self.mock_make_request.assert_called_once()
                 self._assert_request(
-                    self.mock_make_request, "GET", f"/v1/customization/jobs/{job_id}/status", expected_params={"job_id": job_id}
+                    self.mock_make_request,
+                    "GET",
+                    f"/v1/customization/jobs/{job_id}/status",
+                    expected_params={"job_id": job_id},
                 )
     def test_get_training_jobs(self):