Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-03 09:21:45 +00:00)

Merge branch 'main' into add-watsonx-inference-adapter

Commit c407f3c340 · 28 changed files with 1786 additions and 1354 deletions
.github/workflows/providers-build.yml (vendored) · 6 changes

@@ -86,15 +86,15 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v4
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

       - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@8d9ed9ac5c53483de85588cdf95a591a75ab9f55 # v5.5.0
         with:
           python-version: '3.10'

       - name: Install uv
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@0c5e2b8115b80b4c7c5ddf6ffdd634974642d182 # v5.4.1
         with:
           python-version: "3.10"
docs/_static/llama-stack-spec.html (vendored) · 22 changes

@@ -5221,17 +5221,25 @@
           "default": 10
         },
         "model": {
-          "type": "string"
+          "type": "string",
+          "description": "The model identifier to use for the agent"
         },
         "instructions": {
-          "type": "string"
+          "type": "string",
+          "description": "The system instructions for the agent"
         },
+        "name": {
+          "type": "string",
+          "description": "Optional name for the agent, used in telemetry and identification"
+        },
         "enable_session_persistence": {
           "type": "boolean",
-          "default": false
+          "default": false,
+          "description": "Optional flag indicating whether session data has to be persisted"
         },
         "response_format": {
-          "$ref": "#/components/schemas/ResponseFormat"
+          "$ref": "#/components/schemas/ResponseFormat",
+          "description": "Optional response format configuration"
         }
       },
       "additionalProperties": false,
@@ -5239,7 +5247,8 @@
         "model",
         "instructions"
       ],
-      "title": "AgentConfig"
+      "title": "AgentConfig",
+      "description": "Configuration for an agent."
     },
     "AgentTool": {
       "oneOf": [
@@ -8891,8 +8900,7 @@
       },
       "additionalProperties": false,
       "required": [
-        "role",
-        "content"
+        "role"
       ],
       "title": "OpenAIAssistantMessageParam",
       "description": "A message containing the model's (assistant) response in an OpenAI-compatible chat completion request."
docs/_static/llama-stack-spec.yaml (vendored) · 12 changes

@@ -3686,18 +3686,29 @@ components:
           default: 10
         model:
           type: string
+          description: >-
+            The model identifier to use for the agent
         instructions:
           type: string
+          description: The system instructions for the agent
+        name:
+          type: string
+          description: >-
+            Optional name for the agent, used in telemetry and identification
         enable_session_persistence:
           type: boolean
           default: false
+          description: >-
+            Optional flag indicating whether session data has to be persisted
         response_format:
           $ref: '#/components/schemas/ResponseFormat'
+          description: Optional response format configuration
       additionalProperties: false
       required:
         - model
         - instructions
       title: AgentConfig
+      description: Configuration for an agent.
     AgentTool:
       oneOf:
         - type: string
@@ -6097,7 +6108,6 @@ components:
       additionalProperties: false
       required:
         - role
-        - content
       title: OpenAIAssistantMessageParam
       description: >-
         A message containing the model's (assistant) response in an OpenAI-compatible
(deleted file)

@@ -1,88 +0,0 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# NVIDIA Distribution

The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.

| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `inline::localfs` |
| eval | `inline::meta-reference` |
| inference | `remote::nvidia` |
| post_training | `remote::nvidia` |
| safety | `remote::nvidia` |
| scoring | `inline::basic` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `inline::rag-runtime` |
| vector_io | `inline::faiss` |

### Environment Variables

The following environment variables can be configured:

- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)

### Models

The following models are available by default:

- `meta/llama3-8b-instruct (aliases: meta-llama/Llama-3-8B-Instruct)`
- `meta/llama3-70b-instruct (aliases: meta-llama/Llama-3-70B-Instruct)`
- `meta/llama-3.1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `meta/llama-3.1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
- `meta/llama-3.1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
- `meta/llama-3.2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
- `nvidia/llama-3.2-nv-embedqa-1b-v2`
- `nvidia/nv-embedqa-e5-v5`
- `nvidia/nv-embedqa-mistral-7b-v2`
- `snowflake/arctic-embed-l`

### Prerequisite: API Keys

Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).

## Running Llama Stack with NVIDIA

You can do this via Conda (build code) or Docker which has a pre-built image.

### Via Docker

This method allows you to get started quickly without having to build the distribution code.

```bash
LLAMA_STACK_PORT=8321
docker run \
  -it \
  --pull always \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ./run.yaml:/root/my-run.yaml \
  llamastack/distribution-nvidia \
  --yaml-config /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
```

### Via Conda

```bash
llama stack build --template nvidia --image-type conda
llama stack run ./run.yaml \
  --port 8321 \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
  --env INFERENCE_MODEL=$INFERENCE_MODEL
```
@@ -45,20 +45,91 @@ The following models are available by default:

- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
- `meta/llama-3.3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
- `nvidia/llama-3.2-nv-embedqa-1b-v2`
- `nvidia/nv-embedqa-e5-v5`
- `nvidia/nv-embedqa-mistral-7b-v2`
- `snowflake/arctic-embed-l`

-### Prerequisite: API Keys
+## Prerequisites
+### NVIDIA API Keys

-Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
+Make sure you have access to an NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.

### Deploy NeMo Microservices Platform
The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please refer to the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/set-up/deploy-as-platform/index.html) for platform prerequisites and instructions to install and deploy the platform.

## Supported Services
Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
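To make the endpoint layout concrete, here is a minimal sketch of the environment variables described in the sections below. The URLs are placeholders for illustration, not real endpoints — substitute the values from your own deployment:

```python
import os

# Core NeMo microservices (Customizer, Evaluator, Guardrails) typically share
# one platform endpoint; NIM Proxy and Data Store are reached separately.
os.environ["NVIDIA_BASE_URL"] = "http://nim.test"  # NIM Proxy (inference)
os.environ["NVIDIA_DATASETS_URL"] = "http://data-store.test"  # NeMo Data Store
os.environ["NVIDIA_CUSTOMIZER_URL"] = "http://nemo.test"  # NeMo Customizer
os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"  # NeMo Evaluator
os.environ["GUARDRAILS_SERVICE_URL"] = "http://nemo.test"  # NeMo Guardrails
```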
### Inference: NVIDIA NIM
NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.

The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.

### Datasetio API: NeMo Data Store
The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with the Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.

See the [NVIDIA Datasetio docs](/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
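Since the Data Store speaks the Hugging Face Hub API, a minimal sketch of pointing `HfApi` at it could look like the following (the endpoint value is assumed to come from `NVIDIA_DATASETS_URL` as set above):

```python
import os

from huggingface_hub import HfApi

# Point the standard Hugging Face Hub client at the NeMo Data Store endpoint.
hf_api = HfApi(endpoint=os.environ["NVIDIA_DATASETS_URL"], token="")

# List the dataset repositories currently stored in the Data Store.
for dataset in hf_api.list_datasets():
    print(dataset.id)
```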
### Eval API: NeMo Evaluator
The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.

See the [NVIDIA Eval docs](/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
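As a rough illustration of that Benchmark-to-Evaluation-Config mapping, registering a benchmark through the Llama Stack client might look like the sketch below. The IDs and scoring function are made up for illustration; check the linked Eval docs for the exact parameters your client version expects:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Registering a Benchmark in Llama Stack creates a corresponding
# Evaluation Config in NeMo Evaluator (IDs below are hypothetical).
client.benchmarks.register(
    benchmark_id="my-eval-benchmark",
    dataset_id="my-dataset",
    scoring_functions=["basic::equality"],
)
```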
### Post-Training API: NeMo Customizer
The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.

See the [NVIDIA Post-Training docs](/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
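For orientation, submitting a supervised fine-tuning job through the Llama Stack client might look roughly like this. The job name, model, dataset, and the shapes of `algorithm_config`/`training_config` are assumptions for illustration only — consult the linked Post-Training docs for the parameters the NVIDIA provider actually accepts:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Kick off a NeMo Customizer fine-tuning job (all values are illustrative).
job = client.post_training.supervised_fine_tune(
    job_uuid="my-sft-job",
    model="meta-llama/Llama-3.1-8B-Instruct",
    checkpoint_dir="",
    algorithm_config={"type": "LoRA", "adapter_dim": 16},
    training_config={
        "n_epochs": 1,
        "data_config": {"dataset_id": "my-dataset", "batch_size": 8},
    },
    hyperparam_search_config={},
    logger_config={},
)
print(job)
```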
### Safety API: NeMo Guardrails
The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.

See the NVIDIA Safety docs for supported features and example usage.

## Deploying models
To use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.

Note: For improved inference speeds, we need to use NIM with the `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
```sh
# URL to NeMo NIM Proxy service
export NEMO_URL="http://nemo.test"

curl --location "$NEMO_URL/v1/deployment/model-deployments" \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
    "name": "llama-3.2-1b-instruct",
    "namespace": "meta",
    "config": {
      "model": "meta/llama-3.2-1b-instruct",
      "nim_deployment": {
        "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
        "image_tag": "1.8.3",
        "pvc_size": "25Gi",
        "gpu": 1,
        "additional_envs": {
          "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
        }
      }
    }
  }'
```
This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/get-started/tutorials/deploy-nims.html#) for more information on how to deploy a NIM and verify it's available for inference.

You can also remove a deployed NIM to free up GPU resources, if needed.
```sh
export NEMO_URL="http://nemo.test"

curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
```

## Running Llama Stack with NVIDIA

-You can do this via Conda (build code) or Docker which has a pre-built image.
+You can do this via Conda or venv (build code), or Docker, which has a pre-built image.

### Via Docker

@@ -80,9 +151,27 @@ docker run \
### Via Conda

```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
llama stack build --template nvidia --image-type conda
llama stack run ./run.yaml \
  --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
  --env INFERENCE_MODEL=$INFERENCE_MODEL
```

### Via venv

If you've set up your local development environment, you can also build the image using your local virtual environment.

```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
llama stack build --template nvidia --image-type venv
llama stack run ./run.yaml \
  --port 8321 \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
  --env INFERENCE_MODEL=$INFERENCE_MODEL
```

### Example Notebooks
You can reference the Jupyter notebooks in `docs/notebooks/nvidia/` for example usage of these APIs.
- [Llama_Stack_NVIDIA_E2E_Flow.ipynb](/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb) contains an end-to-end workflow for running inference, customizing, and evaluating models using your deployed NeMo Microservices platform.
@@ -225,8 +225,18 @@ class AgentConfigCommon(BaseModel):

 @json_schema_type
 class AgentConfig(AgentConfigCommon):
+    """Configuration for an agent.
+
+    :param model: The model identifier to use for the agent
+    :param instructions: The system instructions for the agent
+    :param name: Optional name for the agent, used in telemetry and identification
+    :param enable_session_persistence: Optional flag indicating whether session data has to be persisted
+    :param response_format: Optional response format configuration
+    """
+
     model: str
     instructions: str
+    name: Optional[str] = None
     enable_session_persistence: Optional[bool] = False
     response_format: Optional[ResponseFormat] = None
@@ -526,9 +526,9 @@ class OpenAIAssistantMessageParam(BaseModel):
     """

     role: Literal["assistant"] = "assistant"
-    content: OpenAIChatCompletionMessageContent
+    content: Optional[OpenAIChatCompletionMessageContent] = None
     name: Optional[str] = None
-    tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = Field(default_factory=list)
+    tool_calls: Optional[List[OpenAIChatCompletionToolCall]] = None


 @json_schema_type
@@ -235,10 +235,14 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
         )

     except (Exception, RuntimeError) as exc:
+        import traceback
+
         cprint(
             f"Error building stack: {exc}",
             color="red",
         )
+        cprint("Stack trace:", color="red")
+        traceback.print_exc()
         sys.exit(1)
     if run_config is None:
         cprint(
@@ -350,7 +354,7 @@ def _run_stack_build_command_from_build_config(
         build_config,
         build_file_path,
         image_name,
-        template_or_config=template_name or config_path,
+        template_or_config=template_name or config_path or str(build_file_path),
     )
     if return_code != 0:
         raise RuntimeError(f"Failed to build image {image_name}")
@@ -37,6 +37,17 @@ def tool_chat_page():
         label="Available ToolGroups", options=builtin_tools_list, selection_mode="multi", on_change=reset_agent
     )

+    if "builtin::rag" in toolgroup_selection:
+        vector_dbs = llama_stack_api.client.vector_dbs.list() or []
+        if not vector_dbs:
+            st.info("No vector databases available for selection.")
+        vector_dbs = [vector_db.identifier for vector_db in vector_dbs]
+        selected_vector_dbs = st.multiselect(
+            label="Select Document Collections to use in RAG queries",
+            options=vector_dbs,
+            on_change=reset_agent,
+        )
+
     st.subheader("MCP Servers")
     mcp_selection = st.pills(
         label="Available MCP Servers", options=mcp_tools_list, selection_mode="multi", on_change=reset_agent
@@ -67,6 +78,16 @@ def tool_chat_page():
         on_change=reset_agent,
     )

+    for i, tool_name in enumerate(toolgroup_selection):
+        if tool_name == "builtin::rag":
+            tool_dict = dict(
+                name="builtin::rag",
+                args={
+                    "vector_db_ids": list(selected_vector_dbs),
+                },
+            )
+            toolgroup_selection[i] = tool_dict
+
     @st.cache_resource
     def create_agent():
         return Agent(
@@ -178,6 +178,8 @@ class ChatAgent(ShieldRunnerMixin):
             span.set_attribute("request", request.model_dump_json())
             turn_id = str(uuid.uuid4())
             span.set_attribute("turn_id", turn_id)
+            if self.agent_config.name:
+                span.set_attribute("agent_name", self.agent_config.name)

             await self._initialize_tools(request.toolgroups)
             async for chunk in self._run_turn(request, turn_id):
@@ -190,6 +192,8 @@ class ChatAgent(ShieldRunnerMixin):
             span.set_attribute("session_id", request.session_id)
             span.set_attribute("request", request.model_dump_json())
             span.set_attribute("turn_id", request.turn_id)
+            if self.agent_config.name:
+                span.set_attribute("agent_name", self.agent_config.name)

             await self._initialize_tools()
             async for chunk in self._run_turn(request):
@@ -498,6 +502,8 @@ class ChatAgent(ShieldRunnerMixin):
         stop_reason = None

         async with tracing.span("inference") as span:
+            if self.agent_config.name:
+                span.set_attribute("agent_name", self.agent_config.name)
             async for chunk in await self.inference_api.chat_completion(
                 self.agent_config.model,
                 input_messages,
llama_stack/providers/remote/inference/nvidia/NVIDIA.md (new file, 85 lines)

@@ -0,0 +1,85 @@
# NVIDIA Inference Provider for LlamaStack

This provider enables running inference using NVIDIA NIM.

## Features
- Endpoints for completions, chat completions, and embeddings for registered models

## Getting Started

### Prerequisites

- LlamaStack with NVIDIA configuration
- Access to NVIDIA NIM deployment
- A NIM deployed for the model to use for inference

### Setup

Build the NVIDIA environment:

```bash
llama stack build --template nvidia --image-type conda
```

### Basic Usage using the LlamaStack Python Client

#### Initialize the client

```python
import os

os.environ["NVIDIA_API_KEY"] = (
    ""  # Required if using hosted NIM endpoint. If self-hosted, not required.
)
os.environ["NVIDIA_BASE_URL"] = "http://nim.test"  # NIM URL

from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

client = LlamaStackAsLibraryClient("nvidia")
client.initialize()
```

### Create Completion

```python
response = client.completion(
    model_id="meta-llama/Llama-3.1-8b-Instruct",
    content="Complete the sentence using one word: Roses are red, violets are :",
    stream=False,
    sampling_params={
        "max_tokens": 50,
    },
)
print(f"Response: {response.content}")
```

### Create Chat Completion

```python
response = client.chat_completion(
    model_id="meta-llama/Llama-3.1-8b-Instruct",
    messages=[
        {
            "role": "system",
            "content": "You must respond to each message with only one word",
        },
        {
            "role": "user",
            "content": "Complete the sentence using one word: Roses are red, violets are:",
        },
    ],
    stream=False,
    sampling_params={
        "max_tokens": 50,
    },
)
print(f"Response: {response.completion_message.content}")
```

### Create Embeddings

```python
response = client.embeddings(
    model_id="meta-llama/Llama-3.1-8b-Instruct", contents=["foo", "bar", "baz"]
)
print(f"Embeddings: {response.embeddings}")
```
@@ -48,6 +48,10 @@ MODEL_ENTRIES = [
         "meta/llama-3.2-90b-vision-instruct",
         CoreModelId.llama3_2_90b_vision_instruct.value,
     ),
+    build_hf_repo_model_entry(
+        "meta/llama-3.3-70b-instruct",
+        CoreModelId.llama3_3_70b_instruct.value,
+    ),
     # NeMo Retriever Text Embedding models -
     #
     # https://docs.nvidia.com/nim/nemo-retriever/text-embedding/latest/support-matrix.html
@@ -27,11 +27,12 @@ from .models import _MODEL_ENTRIES

 # Map API status to JobStatus enum
 STATUS_MAPPING = {
-    "running": "in_progress",
-    "completed": "completed",
-    "failed": "failed",
-    "cancelled": "cancelled",
-    "pending": "scheduled",
+    "running": JobStatus.in_progress.value,
+    "completed": JobStatus.completed.value,
+    "failed": JobStatus.failed.value,
+    "cancelled": JobStatus.cancelled.value,
+    "pending": JobStatus.scheduled.value,
+    "unknown": JobStatus.scheduled.value,
 }
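For context, a self-contained sketch of how a status mapping like this is typically consumed — the `JobStatus` enum below is a stand-in with an assumed shape, not the actual llama_stack definition, and the fallback mirrors the new `"unknown"` entry:

```python
from enum import Enum


class JobStatus(Enum):  # stand-in enum for illustration only
    scheduled = "scheduled"
    in_progress = "in_progress"
    completed = "completed"
    failed = "failed"
    cancelled = "cancelled"


STATUS_MAPPING = {
    "running": JobStatus.in_progress.value,
    "completed": JobStatus.completed.value,
    "failed": JobStatus.failed.value,
    "cancelled": JobStatus.cancelled.value,
    "pending": JobStatus.scheduled.value,
    "unknown": JobStatus.scheduled.value,
}


def to_job_status(api_status: str) -> JobStatus:
    # Translate a raw Customizer API status string into a JobStatus member,
    # falling back to "scheduled" for anything unrecognized.
    return JobStatus(STATUS_MAPPING.get(api_status.lower(), JobStatus.scheduled.value))


assert to_job_status("running") is JobStatus.in_progress
assert to_job_status("something-new") is JobStatus.scheduled
```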
llama_stack/providers/remote/safety/nvidia/README.md (new file, 77 lines)

@@ -0,0 +1,77 @@
# NVIDIA Safety Provider for LlamaStack

This provider enables safety checks and guardrails for LLM interactions using NVIDIA's NeMo Guardrails service.

## Features

- Run safety checks for messages

## Getting Started

### Prerequisites

- LlamaStack with NVIDIA configuration
- Access to NVIDIA NeMo Guardrails service
- A NIM deployed for the model to use for safety checks

### Setup

Build the NVIDIA environment:

```bash
llama stack build --template nvidia --image-type conda
```

### Basic Usage using the LlamaStack Python Client

#### Initialize the client

```python
import os

os.environ["NVIDIA_API_KEY"] = "your-api-key"
os.environ["NVIDIA_GUARDRAILS_URL"] = "http://guardrails.test"

from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

client = LlamaStackAsLibraryClient("nvidia")
client.initialize()
```

#### Create a safety shield

```python
from llama_stack.apis.safety import Shield
from llama_stack.apis.inference import Message

# Create a safety shield
shield = Shield(
    shield_id="your-shield-id",
    provider_resource_id="safety-model-id",  # The model to use for safety checks
    description="Safety checks for content moderation",
)

# Register the shield
await client.safety.register_shield(shield)
```

#### Run safety checks

```python
# Messages to check
messages = [Message(role="user", content="Your message to check")]

# Run safety check
response = await client.safety.run_shield(
    shield_id="your-shield-id",
    messages=messages,
)

# Check for violations
if response.violation:
    print(f"Safety violation detected: {response.violation.user_message}")
    print(f"Violation level: {response.violation.violation_level}")
    print(f"Metadata: {response.violation.metadata}")
else:
    print("No safety violations detected")
```
@@ -25,14 +25,84 @@ The following models are available by default:

{% endif %}

-### Prerequisite: API Keys
+## Prerequisites
+### NVIDIA API Keys

-Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/).
+Make sure you have access to an NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.

### Deploy NeMo Microservices Platform
The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please refer to the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/set-up/deploy-as-platform/index.html) for platform prerequisites and instructions to install and deploy the platform.

## Supported Services
Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.

### Inference: NVIDIA NIM
NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.

The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.

### Datasetio API: NeMo Data Store
The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with the Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.

See the [NVIDIA Datasetio docs](/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.

### Eval API: NeMo Evaluator
The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.

See the [NVIDIA Eval docs](/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.

### Post-Training API: NeMo Customizer
The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.

See the [NVIDIA Post-Training docs](/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.

### Safety API: NeMo Guardrails
The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.

See the NVIDIA Safety docs for supported features and example usage.

## Deploying models
To use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.

Note: For improved inference speeds, we need to use NIM with the `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
```sh
# URL to NeMo NIM Proxy service
export NEMO_URL="http://nemo.test"

curl --location "$NEMO_URL/v1/deployment/model-deployments" \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
    "name": "llama-3.2-1b-instruct",
    "namespace": "meta",
    "config": {
      "model": "meta/llama-3.2-1b-instruct",
      "nim_deployment": {
        "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
        "image_tag": "1.8.3",
        "pvc_size": "25Gi",
        "gpu": 1,
        "additional_envs": {
          "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
        }
      }
    }
  }'
```
This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/documentation/latest/nemo-microservices/latest-early_access/get-started/tutorials/deploy-nims.html#) for more information on how to deploy a NIM and verify it's available for inference.

You can also remove a deployed NIM to free up GPU resources, if needed.
```sh
export NEMO_URL="http://nemo.test"

curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
```

## Running Llama Stack with NVIDIA

-You can do this via Conda (build code) or Docker which has a pre-built image.
+You can do this via Conda or venv (build code), or Docker, which has a pre-built image.

### Via Docker

@@ -54,9 +124,27 @@ docker run \
### Via Conda

```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
llama stack build --template nvidia --image-type conda
llama stack run ./run.yaml \
  --port 8321 \
-  --env NVIDIA_API_KEY=$NVIDIA_API_KEY
+  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
  --env INFERENCE_MODEL=$INFERENCE_MODEL
```

### Via venv

If you've set up your local development environment, you can also build the image using your local virtual environment.

```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
llama stack build --template nvidia --image-type venv
llama stack run ./run.yaml \
  --port 8321 \
  --env NVIDIA_API_KEY=$NVIDIA_API_KEY \
  --env INFERENCE_MODEL=$INFERENCE_MODEL
```

### Example Notebooks
You can reference the Jupyter notebooks in `docs/notebooks/nvidia/` for example usage of these APIs.
- [Llama_Stack_NVIDIA_E2E_Flow.ipynb](/docs/notebooks/nvidia/Llama_Stack_NVIDIA_E2E_Flow.ipynb) contains an end-to-end workflow for running inference, customizing, and evaluating models using your deployed NeMo Microservices platform.
@@ -59,7 +59,7 @@ def get_distribution_template() -> DistributionTemplate:
     default_models = get_model_registry(available_models)
     return DistributionTemplate(
         name="nvidia",
-        distro_type="remote_hosted",
+        distro_type="self_hosted",
         description="Use NVIDIA NIM for running LLM inference and safety",
         container_image=None,
         template_path=Path(__file__).parent / "doc_template.md",
@@ -173,6 +173,16 @@ models:
   provider_id: nvidia
   provider_model_id: meta/llama-3.2-90b-vision-instruct
   model_type: llm
+- metadata: {}
+  model_id: meta/llama-3.3-70b-instruct
+  provider_id: nvidia
+  provider_model_id: meta/llama-3.3-70b-instruct
+  model_type: llm
+- metadata: {}
+  model_id: meta-llama/Llama-3.3-70B-Instruct
+  provider_id: nvidia
+  provider_model_id: meta/llama-3.3-70b-instruct
+  model_type: llm
 - metadata:
     embedding_dimension: 2048
     context_length: 8192
@@ -46,6 +46,7 @@ dev = [
     "pytest-asyncio",
     "pytest-cov",
     "pytest-html",
+    "pytest-json-report",
     "nbval",  # For notebook testing
     "black",
     "ruff",
@@ -115,6 +115,70 @@ def test_agent_simple(llama_stack_client_with_mocked_inference, agent_config):
    assert "I can't" in logs_str


def test_agent_name(llama_stack_client, text_model_id):
    agent_name = f"test-agent-{uuid4()}"

    try:
        agent = Agent(
            llama_stack_client,
            model=text_model_id,
            instructions="You are a helpful assistant",
            name=agent_name,
        )
    except TypeError:
        agent = Agent(
            llama_stack_client,
            model=text_model_id,
            instructions="You are a helpful assistant",
        )
        return

    session_id = agent.create_session(f"test-session-{uuid4()}")

    agent.create_turn(
        messages=[
            {
                "role": "user",
                "content": "Give me a sentence that contains the word: hello",
            }
        ],
        session_id=session_id,
        stream=False,
    )

    all_spans = []
    for span in llama_stack_client.telemetry.query_spans(
        attribute_filters=[
            {"key": "session_id", "op": "eq", "value": session_id},
        ],
        attributes_to_return=["input", "output", "agent_name", "agent_id", "session_id"],
    ):
        all_spans.append(span.attributes)

    agent_name_spans = []
    for span in llama_stack_client.telemetry.query_spans(
        attribute_filters=[],
        attributes_to_return=["agent_name"],
    ):
        if "agent_name" in span.attributes:
            agent_name_spans.append(span.attributes)

    agent_logs = []
    for span in llama_stack_client.telemetry.query_spans(
        attribute_filters=[
            {"key": "agent_name", "op": "eq", "value": agent_name},
        ],
        attributes_to_return=["input", "output", "agent_name"],
    ):
        if "output" in span.attributes and span.attributes["output"] != "no shields":
            agent_logs.append(span.attributes)

    assert len(agent_logs) == 1
    assert agent_logs[0]["agent_name"] == agent_name
    assert "Give me a sentence that contains the word: hello" in agent_logs[0]["input"]
    assert "hello" in agent_logs[0]["output"].lower()


def test_tool_config(llama_stack_client_with_mocked_inference, agent_config):
    common_params = dict(
        model="meta-llama/Llama-3.2-3B-Instruct",
tests/unit/distribution/test_build_path.py (new file, 38 lines)

@@ -0,0 +1,38 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pathlib import Path

from llama_stack.cli.stack._build import (
    _run_stack_build_command_from_build_config,
)
from llama_stack.distribution.datatypes import BuildConfig, DistributionSpec
from llama_stack.distribution.utils.image_types import LlamaStackImageType


def test_container_build_passes_path(monkeypatch, tmp_path):
    called_with = {}

    def spy_build_image(cfg, build_file_path, image_name, template_or_config):
        called_with["path"] = template_or_config
        return 0

    monkeypatch.setattr(
        "llama_stack.cli.stack._build.build_image",
        spy_build_image,
        raising=True,
    )

    cfg = BuildConfig(
        image_type=LlamaStackImageType.CONTAINER.value,
        distribution_spec=DistributionSpec(providers={}, description=""),
    )

    _run_stack_build_command_from_build_config(cfg, image_name="dummy")

    assert "path" in called_with
    assert isinstance(called_with["path"], str)
    assert Path(called_with["path"]).exists()
@@ -200,35 +200,48 @@ class TestNvidiaPostTraining(unittest.TestCase):
         )

     def test_get_training_job_status(self):
-        self.mock_make_request.return_value = {
-            "created_at": "2024-12-09T04:06:28.580220",
-            "updated_at": "2024-12-09T04:21:19.852832",
-            "status": "completed",
-            "steps_completed": 1210,
-            "epochs_completed": 2,
-            "percentage_done": 100.0,
-            "best_epoch": 2,
-            "train_loss": 1.718016266822815,
-            "val_loss": 1.8661999702453613,
-        }
-
-        job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
-
-        status = self.run_async(self.adapter.get_training_job_status(job_uuid=job_id))
-
-        assert isinstance(status, NvidiaPostTrainingJobStatusResponse)
-        assert status.status.value == "completed"
-        assert status.steps_completed == 1210
-        assert status.epochs_completed == 2
-        assert status.percentage_done == 100.0
-        assert status.best_epoch == 2
-        assert status.train_loss == 1.718016266822815
-        assert status.val_loss == 1.8661999702453613
-
-        self.mock_make_request.assert_called_once()
-        self._assert_request(
-            self.mock_make_request, "GET", f"/v1/customization/jobs/{job_id}/status", expected_params={"job_id": job_id}
-        )
+        customizer_status_to_job_status = [
+            ("running", "in_progress"),
+            ("completed", "completed"),
+            ("failed", "failed"),
+            ("cancelled", "cancelled"),
+            ("pending", "scheduled"),
+            ("unknown", "scheduled"),
+        ]
+
+        job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
+        for customizer_status, expected_status in customizer_status_to_job_status:
+            with self.subTest(customizer_status=customizer_status, expected_status=expected_status):
+                self.mock_make_request.return_value = {
+                    "created_at": "2024-12-09T04:06:28.580220",
+                    "updated_at": "2024-12-09T04:21:19.852832",
+                    "status": customizer_status,
+                    "steps_completed": 1210,
+                    "epochs_completed": 2,
+                    "percentage_done": 100.0,
+                    "best_epoch": 2,
+                    "train_loss": 1.718016266822815,
+                    "val_loss": 1.8661999702453613,
+                }
+
+                status = self.run_async(self.adapter.get_training_job_status(job_uuid=job_id))
+
+                assert isinstance(status, NvidiaPostTrainingJobStatusResponse)
+                assert status.status.value == expected_status
+                assert status.steps_completed == 1210
+                assert status.epochs_completed == 2
+                assert status.percentage_done == 100.0
+                assert status.best_epoch == 2
+                assert status.train_loss == 1.718016266822815
+                assert status.val_loss == 1.8661999702453613
+
+                self._assert_request(
+                    self.mock_make_request,
+                    "GET",
+                    f"/v1/customization/jobs/{job_id}/status",
+                    expected_params={"job_id": job_id},
+                )

     def test_get_training_jobs(self):
         job_id = "cust-JGTaMbJMdqjJU8WbQdN9Q2"
@@ -8,29 +8,44 @@ This framework allows you to run the same set of verification tests against diff

## Features

-The verification suite currently tests:
+The verification suite currently tests the following in both streaming and non-streaming modes:

-- Basic chat completions (streaming and non-streaming)
+- Basic chat completions
- Image input capabilities
- Structured JSON output formatting
- Tool calling functionality

## Report

The latest report can be found at [REPORT.md](REPORT.md).

To update the report, ensure you have the API keys set,
```bash
export OPENAI_API_KEY=<your_openai_api_key>
export FIREWORKS_API_KEY=<your_fireworks_api_key>
export TOGETHER_API_KEY=<your_together_api_key>
```
then run
```bash
uv run --with-editable ".[dev]" python tests/verifications/generate_report.py --run-tests
```

## Running Tests

To run the verification tests, use pytest with the following parameters:

```bash
cd llama-stack
-pytest tests/verifications/openai --provider=<provider-name>
+pytest tests/verifications/openai_api --provider=<provider-name>
```

Example:
```bash
# Run all tests
-pytest tests/verifications/openai --provider=together
+pytest tests/verifications/openai_api --provider=together

# Only run tests with Llama 4 models
-pytest tests/verifications/openai --provider=together -k 'Llama-4'
+pytest tests/verifications/openai_api --provider=together -k 'Llama-4'
```

### Parameters

@@ -41,23 +56,22 @@ pytest tests/verifications/openai --provider=together -k 'Llama-4'

## Supported Providers

-The verification suite currently supports:
-- OpenAI
-- Fireworks
-- Together
-- Groq
-- Cerebras
+The verification suite supports any provider with an OpenAI-compatible endpoint.
+
+See `tests/verifications/conf/` for the list of supported providers.
+
+To run on a new provider, simply add a new yaml file to the `conf/` directory with the provider config. See `tests/verifications/conf/together.yaml` for an example.

## Adding New Test Cases

-To add new test cases, create appropriate JSON files in the `openai/fixtures/test_cases/` directory following the existing patterns.
+To add new test cases, create appropriate JSON files in the `openai_api/fixtures/test_cases/` directory following the existing patterns.


## Structure

- `__init__.py` - Marks the directory as a Python package
- `conftest.py` - Global pytest configuration and fixtures
-- `openai/` - Tests specific to OpenAI-compatible APIs
+- `conf/` - Provider-specific configuration files
+- `openai_api/` - Tests specific to OpenAI-compatible APIs
- `fixtures/` - Test fixtures and utilities
- `fixtures.py` - Provider-specific fixtures
- `load.py` - Utilities for loading test cases
@@ -1,6 +1,6 @@
# Test Results Report

-*Generated on: 2025-04-14 18:11:37*
+*Generated on: 2025-04-16 15:10:57*

*This report was generated by running `python tests/verifications/generate_report.py`*

@@ -15,7 +15,7 @@

| Provider | Pass Rate | Tests Passed | Total Tests |
| --- | --- | --- | --- |
-| Together | 48.7% | 37 | 76 |
+| Together | 51.3% | 39 | 76 |
| Fireworks | 47.4% | 36 | 76 |
| Openai | 100.0% | 52 | 52 |

@@ -23,7 +23,7 @@

## Together

-*Tests run on: 2025-04-14 18:08:14*
+*Tests run on: 2025-04-16 15:03:51*

```bash
# Run all tests for this provider:

@@ -49,8 +49,8 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=togethe
| test_chat_non_streaming_basic (saturn) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_image | ⚪ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (add_product_tool) | ✅ | ✅ | ✅ |
-| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ❌ | ✅ | ✅ |
-| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ❌ | ✅ |
+| test_chat_non_streaming_multi_turn_tool_calling (compare_monthly_expense_tool) | ✅ | ✅ | ✅ |
+| test_chat_non_streaming_multi_turn_tool_calling (get_then_create_event_tool) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_multi_turn_tool_calling (text_then_weather_tool) | ❌ | ❌ | ❌ |
| test_chat_non_streaming_multi_turn_tool_calling (weather_tool_then_text) | ✅ | ✅ | ✅ |
| test_chat_non_streaming_structured_output (calendar) | ✅ | ✅ | ✅ |

@@ -74,7 +74,7 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=togethe

## Fireworks

-*Tests run on: 2025-04-14 18:04:06*
+*Tests run on: 2025-04-16 15:05:54*

```bash
# Run all tests for this provider:

@@ -125,7 +125,7 @@ pytest tests/verifications/openai_api/test_chat_completion.py --provider=firewor

## Openai

-*Tests run on: 2025-04-14 18:09:51*
+*Tests run on: 2025-04-16 15:09:18*

```bash
# Run all tests for this provider:
@@ -3,14 +3,6 @@
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-# /// script
-# requires-python = ">=3.10"
-# dependencies = [
-#   "pytest-json-report",
-#   "pyyaml",
-# ]
-# ///
 """
 Test Report Generator

@@ -67,16 +59,10 @@ RESULTS_DIR.mkdir(exist_ok=True)
 # Maximum number of test result files to keep per provider
 MAX_RESULTS_PER_PROVIDER = 1

-PROVIDER_ORDER = [
+DEFAULT_PROVIDERS = [
     "together",
     "fireworks",
-    "groq",
-    "cerebras",
     "openai",
-    "together-llama-stack",
-    "fireworks-llama-stack",
-    "groq-llama-stack",
-    "openai-llama-stack",
 ]

 VERIFICATION_CONFIG = _load_all_verification_configs()

@@ -142,6 +128,14 @@ def run_tests(provider, keyword=None):
         return None


+def run_multiple_tests(providers_to_run: list[str], keyword: str | None):
+    """Runs tests for a list of providers."""
+    print(f"Running tests for providers: {', '.join(providers_to_run)}")
+    for provider in providers_to_run:
+        run_tests(provider.strip(), keyword=keyword)
+    print("Finished running tests.")
+
+
 def parse_results(
     result_file,
 ) -> Tuple[DefaultDict[str, DefaultDict[str, Dict[str, bool]]], DefaultDict[str, Set[str]], Set[str], str]:

@@ -250,20 +244,6 @@ def parse_results(
     return parsed_results, providers_in_file, tests_in_file, run_timestamp_str


-def get_all_result_files_by_provider():
-    """Get all test result files, keyed by provider."""
-    provider_results = {}
-
-    result_files = list(RESULTS_DIR.glob("*.json"))
-
-    for file in result_files:
-        provider = file.stem
-        if provider:
-            provider_results[provider] = file
-
-    return provider_results
-
-
 def generate_report(
     results_dict: Dict[str, Any],
     providers: Dict[str, Set[str]],

@@ -276,6 +256,7 @@ def generate_report(
     Args:
         results_dict: Aggregated results [provider][model][test_name] -> status.
         providers: Dict of all providers and their models {provider: {models}}.
+                   The order of keys in this dict determines the report order.
         all_tests: Set of all test names found.
         provider_timestamps: Dict of provider to timestamp when tests were run
         output_file: Optional path to save the report.

@@ -353,22 +334,17 @@ def generate_report(
                     passed_tests += 1
         provider_totals[provider] = (provider_passed, provider_total)

-    # Add summary table (use passed-in providers dict)
+    # Add summary table (use the order from the providers dict keys)
     report.append("| Provider | Pass Rate | Tests Passed | Total Tests |")
     report.append("| --- | --- | --- | --- |")
-    for provider in [p for p in PROVIDER_ORDER if p in providers]:  # Check against keys of passed-in dict
-        passed, total = provider_totals.get(provider, (0, 0))
-        pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
-        report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
-    for provider in [p for p in providers if p not in PROVIDER_ORDER]:  # Check against keys of passed-in dict
+    # Iterate through providers in the order they appear in the input dict
+    for provider in providers_sorted.keys():
         passed, total = provider_totals.get(provider, (0, 0))
         pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A"
         report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |")
     report.append("\n")

-    for provider in sorted(
-        providers_sorted.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p)
-    ):
+    for provider in providers_sorted.keys():
         provider_models = providers_sorted[provider]  # Use sorted models
         if not provider_models:
             continue

@@ -461,60 +437,62 @@ def main():
         "--providers",
         type=str,
         nargs="+",
-        help="Specify providers to test (comma-separated or space-separated, default: all)",
+        help="Specify providers to include/test (comma-separated or space-separated, default: uses DEFAULT_PROVIDERS)",
     )
     parser.add_argument("--output", type=str, help="Output file location (default: tests/verifications/REPORT.md)")
     parser.add_argument("--k", type=str, help="Keyword expression to filter tests (passed to pytest -k)")
     args = parser.parse_args()

     all_results = {}
-    # Initialize collections to aggregate results in main
-    aggregated_providers = defaultdict(set)
+    final_providers_order = {}  # Dictionary to store results, preserving processing order
     aggregated_tests = set()
     provider_timestamps = {}

-    if args.run_tests:
-        # Get list of available providers from command line or use detected providers
-        if args.providers:
-            # Handle both comma-separated and space-separated lists
-            test_providers = []
-            for provider_arg in args.providers:
-                # Split by comma if commas are present
-                if "," in provider_arg:
-                    test_providers.extend(provider_arg.split(","))
-                else:
-                    test_providers.append(provider_arg)
-        else:
-            # Default providers to test
-            test_providers = PROVIDER_ORDER
-
-        for provider in test_providers:
-            provider = provider.strip()  # Remove any whitespace
-            result_file = run_tests(provider, keyword=args.k)
-            if result_file:
-                # Parse and aggregate results
-                parsed_results, providers_in_file, tests_in_file, run_timestamp = parse_results(result_file)
-                all_results.update(parsed_results)
-                for prov, models in providers_in_file.items():
-                    aggregated_providers[prov].update(models)
-                    if run_timestamp:
-                        provider_timestamps[prov] = run_timestamp
-                aggregated_tests.update(tests_in_file)
-    else:
-        # Use existing results
-        provider_result_files = get_all_result_files_by_provider()
-
-        for result_file in provider_result_files.values():
-            # Parse and aggregate results
-            parsed_results, providers_in_file, tests_in_file, run_timestamp = parse_results(result_file)
-            all_results.update(parsed_results)
-            for prov, models in providers_in_file.items():
-                aggregated_providers[prov].update(models)
-                if run_timestamp:
-                    provider_timestamps[prov] = run_timestamp
-            aggregated_tests.update(tests_in_file)
-
-    generate_report(all_results, aggregated_providers, aggregated_tests, provider_timestamps, args.output)
+    # 1. Determine the desired list and order of providers
+    if args.providers:
+        desired_providers = []
+        for provider_arg in args.providers:
+            desired_providers.extend([p.strip() for p in provider_arg.split(",")])
+    else:
+        desired_providers = DEFAULT_PROVIDERS  # Use default order/list
+
+    # 2. Run tests if requested (using the desired provider list)
+    if args.run_tests:
+        run_multiple_tests(desired_providers, args.k)
+
+    for provider in desired_providers:
+        # Construct the expected result file path directly
+        result_file = RESULTS_DIR / f"{provider}.json"
+
+        if result_file.exists():  # Check if the specific file exists
+            print(f"Loading results for {provider} from {result_file}")
+            try:
+                parsed_data = parse_results(result_file)
+                parsed_results, providers_in_file, tests_in_file, run_timestamp = parsed_data
+                all_results.update(parsed_results)
+                aggregated_tests.update(tests_in_file)
+
+                # Add models for this provider, ensuring it's added in the correct report order
+                if provider in providers_in_file:
+                    if provider not in final_providers_order:
+                        final_providers_order[provider] = set()
+                    final_providers_order[provider].update(providers_in_file[provider])
+                    if run_timestamp != "Unknown":
+                        provider_timestamps[provider] = run_timestamp
+                else:
+                    print(
+                        f"Warning: Provider '{provider}' found in desired list but not within its result file data ({result_file})."
+                    )
+
+            except Exception as e:
+                print(f"Error parsing results for provider {provider} from {result_file}: {e}")
+        else:
+            # Only print warning if we expected results (i.e., provider was in the desired list)
+            print(f"Result file for desired provider '{provider}' not found at {result_file}. Skipping.")
+
+    # 5. Generate the report using the filtered & ordered results
+    print(f"Final Provider Order for Report: {list(final_providers_order.keys())}")
+    generate_report(all_results, final_providers_order, aggregated_tests, provider_timestamps, args.output)


 if __name__ == "__main__":
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
uv.lock (generated) · 17 changes

@@ -1,4 +1,5 @@
 version = 1
+revision = 1
 requires-python = ">=3.10"
 resolution-markers = [
     "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')",
@@ -1410,6 +1411,7 @@ dev = [
     { name = "pytest-asyncio" },
     { name = "pytest-cov" },
     { name = "pytest-html" },
+    { name = "pytest-json-report" },
     { name = "ruamel-yaml" },
     { name = "ruff" },
     { name = "types-requests" },
@@ -1502,6 +1504,7 @@ requires-dist = [
     { name = "pytest-asyncio", marker = "extra == 'dev'" },
     { name = "pytest-cov", marker = "extra == 'dev'" },
     { name = "pytest-html", marker = "extra == 'dev'" },
+    { name = "pytest-json-report", marker = "extra == 'dev'" },
     { name = "python-dotenv" },
     { name = "qdrant-client", marker = "extra == 'unit'" },
     { name = "requests" },
@@ -1531,6 +1534,7 @@ requires-dist = [
     { name = "types-setuptools", marker = "extra == 'dev'" },
     { name = "uvicorn", marker = "extra == 'dev'" },
 ]
+provides-extras = ["dev", "unit", "test", "docs", "codegen", "ui"]

 [[package]]
 name = "llama-stack-client"
@@ -2740,6 +2744,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c8/c7/c160021cbecd956cc1a6f79e5fe155f7868b2e5b848f1320dad0b3e3122f/pytest_html-4.1.1-py3-none-any.whl", hash = "sha256:c8152cea03bd4e9bee6d525573b67bbc6622967b72b9628dda0ea3e2a0b5dd71", size = 23491 },
 ]

+[[package]]
+name = "pytest-json-report"
+version = "1.5.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pytest" },
+    { name = "pytest-metadata" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/4f/d3/765dae9712fcd68d820338908c1337e077d5fdadccd5cacf95b9b0bea278/pytest-json-report-1.5.0.tar.gz", hash = "sha256:2dde3c647851a19b5f3700729e8310a6e66efb2077d674f27ddea3d34dc615de", size = 21241 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/81/35/d07400c715bf8a88aa0c1ee9c9eb6050ca7fe5b39981f0eea773feeb0681/pytest_json_report-1.5.0-py3-none-any.whl", hash = "sha256:9897b68c910b12a2e48dd849f9a284b2c79a732a8a9cb398452ddd23d3c8c325", size = 13222 },
+]
+
 [[package]]
 name = "pytest-metadata"
 version = "3.1.1"