From 378f0de439917bc76d41e216a98d082bc02447aa Mon Sep 17 00:00:00 2001 From: ehhuang Date: Sun, 6 Apr 2025 18:51:34 -0700 Subject: [PATCH 01/24] docs: llama4 getting started nb (#1878) # What does this PR do? ## Test Plan --- docs/getting_started_llama4.ipynb | 876 ++++++++++++++++++++++++++++++ docs/source/index.md | 5 + 2 files changed, 881 insertions(+) create mode 100644 docs/getting_started_llama4.ipynb diff --git a/docs/getting_started_llama4.ipynb b/docs/getting_started_llama4.ipynb new file mode 100644 index 000000000..d489b5d06 --- /dev/null +++ b/docs/getting_started_llama4.ipynb @@ -0,0 +1,876 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c1e7571c", + "metadata": { + "id": "c1e7571c" + }, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)\n", + "\n", + "# Getting Started with Llama 4 in Llama Stack\n", + "\n", + "\"drawing\"\n", + "\n", + "[Llama Stack](https://github.com/meta-llama/llama-stack) defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations.\n", + "\n", + "Read more about the project here: https://llama-stack.readthedocs.io/en/latest/index.html\n", + "\n", + "In this guide, we will showcase how you can get started with using Llama 4 in Llama Stack.\n" + ] + }, + { + "cell_type": "markdown", + "id": "4CV1Q19BDMVw", + "metadata": { + "id": "4CV1Q19BDMVw" + }, + "source": [ + "## 1. Getting started with Llama Stack" + ] + }, + { + "cell_type": "markdown", + "id": "K4AvfUAJZOeS", + "metadata": { + "id": "K4AvfUAJZOeS" + }, + "source": [ + "### 1.1. Download Llama 4 Model\n", + "\n", + "In this showcase, we will run Llama 4 locally. Note that you need an 8xH100 GPU host to run these models." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fb2e8b6", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install uv \n", + "\n", + "MODEL=\"Llama-4-Scout-17B-16E-Instruct\"\n", + "# get meta url from llama.com\n", + "!uv run --with llama-stack llama model download --source meta --model-id $MODEL --meta-url \n", + "\n", + "model_id = f\"meta-llama/{MODEL}\"" + ] + }, + { + "cell_type": "markdown", + "id": "oDUB7M_qe-Gs", + "metadata": { + "id": "oDUB7M_qe-Gs" + }, + "source": [ + "### 1.2. Setup and Running a Llama Stack server\n", + "\n", + "Llama Stack is architected as a collection of APIs that provide developers with the building blocks to build AI applications. \n", + "\n", + "Llama stack is typically available as a server with an endpoint that you can make calls to. Partners like Together and Fireworks offer their own Llama Stack compatible endpoints.\n", + "\n", + "In this showcase, we will start a Llama Stack server that is running locally.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "J2kGed0R5PSf", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "J2kGed0R5PSf", + "outputId": "2478ea60-8d35-48a1-b011-f233831740c5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: uv in /opt/homebrew/Caskroom/miniconda/base/envs/l4/lib/python3.10/site-packages (0.6.12)\n", + "\u001b[2mUsing Python 3.10.16 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/l4\u001b[0m\n", + "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 83ms\u001b[0m\u001b[0m\n", + "Environment '/Users/erichuang/projects/internal-llama-stack/.venv' already exists, re-using it.\n", + "Virtual environment /Users/erichuang/projects/internal-llama-stack/.venv is already active\n", + "\u001b[2mUsing Python 3.11.11 environment at: /Users/erichuang/projects/internal-llama-stack/.venv\u001b[0m\n", + "\u001b[2mAudited 
\u001b[1m1 package\u001b[0m \u001b[2min 387ms\u001b[0m\u001b[0m\n", + "Installing pip dependencies\n", + "\u001b[2mUsing Python 3.11.11 environment at: /Users/erichuang/projects/internal-llama-stack/.venv\u001b[0m\n", + "\u001b[2K\u001b[2mResolved \u001b[1m123 packages\u001b[0m \u001b[2min 1.13s\u001b[0m\u001b[0m \u001b[0m\n", + "\u001b[2K\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6) \n", + "\u001b[2K\u001b[1A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)-----\u001b[0m\u001b[0m 0 B/9.53 KiB \u001b[1A\n", + "\u001b[2K\u001b[1A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)-\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB \u001b[1A\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2K\u001b[2A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 0 B/44.00 KiB \u001b[2A\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2K\u001b[2A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB \u001b[2A\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtabulate \u001b[0m \u001b[32m\u001b[2m------------------------------\u001b[0m\u001b[0m 0 B/34.43 KiB\n", + "\u001b[2K\u001b[3A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB \u001b[3A\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtabulate \u001b[0m \u001b[32m-------------\u001b[2m-----------------\u001b[0m\u001b[0m 14.83 KiB/34.43 KiB\n", + "\u001b[2K\u001b[3A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB 
\u001b[3A\n", + "\u001b[2meval-type-backport\u001b[0m \u001b[32m\u001b[2m------------------------------\u001b[0m\u001b[0m 0 B/5.69 KiB\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtabulate \u001b[0m \u001b[32m-------------\u001b[2m-----------------\u001b[0m\u001b[0m 14.83 KiB/34.43 KiB\n", + "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB \u001b[4A\n", + "\u001b[2meval-type-backport\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 5.69 KiB/5.69 KiB\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtabulate \u001b[0m \u001b[32m-------------\u001b[2m-----------------\u001b[0m\u001b[0m 14.83 KiB/34.43 KiB\n", + "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB \u001b[4A\n", + "\u001b[2meval-type-backport\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 5.69 KiB/5.69 KiB\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtabulate \u001b[0m \u001b[32m-------------\u001b[2m-----------------\u001b[0m\u001b[0m 14.83 KiB/34.43 KiB\n", + "\u001b[2mtyper \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n", + "\u001b[2K\u001b[5A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 0 B/85.81 KiB \u001b[5A\n", + "\u001b[2meval-type-backport\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 5.69 KiB/5.69 KiB\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtabulate \u001b[0m 
\u001b[32m-------------\u001b[2m-----------------\u001b[0m\u001b[0m 14.83 KiB/34.43 KiB\n", + "\u001b[2mtyper \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n", + "\u001b[2K\u001b[5A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB \u001b[5A\n", + "\u001b[2meval-type-backport\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 5.69 KiB/5.69 KiB\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtabulate \u001b[0m \u001b[32m-------------\u001b[2m-----------------\u001b[0m\u001b[0m 14.83 KiB/34.43 KiB\n", + "\u001b[2mtyper \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n", + "\u001b[2mtogether \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[6A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 0 B/3.08 MiB \u001b[6A\n", + "\u001b[2meval-type-backport\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 5.69 KiB/5.69 KiB\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtabulate \u001b[0m \u001b[32m-------------\u001b[2m-----------------\u001b[0m\u001b[0m 14.83 KiB/34.43 KiB\n", + "\u001b[2mtyper \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n", + "\u001b[2mtogether \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[6A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.91 KiB/3.08 MiB \u001b[6A\n", + "\u001b[2meval-type-backport\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 5.69 KiB/5.69 
KiB\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtabulate \u001b[0m \u001b[32m---------------------------\u001b[2m---\u001b[0m\u001b[0m 30.83 KiB/34.43 KiB\n", + "\u001b[2mtyper \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n", + "\u001b[2mtogether \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[6A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.91 KiB/3.08 MiB \u001b[6A\n", + "\u001b[2meval-type-backport\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 5.69 KiB/5.69 KiB\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtabulate \u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 34.43 KiB/34.43 KiB\n", + "\u001b[2mtyper \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n", + "\u001b[2mtogether \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[6A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 14.91 KiB/3.08 MiB \u001b[6A\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtabulate \u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 34.43 KiB/34.43 KiB\n", + "\u001b[2mtyper \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n", + "\u001b[2mtogether \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[5A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m 
(0/6)----\u001b[0m\u001b[0m 14.91 KiB/3.08 MiB \u001b[5A\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtabulate \u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 34.43 KiB/34.43 KiB\n", + "\u001b[2mtyper \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n", + "\u001b[2mtogether \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[5A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 30.91 KiB/3.08 MiB \u001b[5A\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtyper \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n", + "\u001b[2mtogether \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 30.91 KiB/3.08 MiB \u001b[4A\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtyper \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n", + "\u001b[2mtogether \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 46.91 KiB/3.08 MiB \u001b[4A\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtyper \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n", + "\u001b[2mtogether \u001b[0m 
\u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 62.91 KiB/3.08 MiB \u001b[4A\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtyper \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n", + "\u001b[2mtogether \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 78.91 KiB/3.08 MiB \u001b[4A\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtyper \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n", + "\u001b[2mtogether \u001b[0m \u001b[32m------\u001b[2m------------------------\u001b[0m\u001b[0m 16.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 94.91 KiB/3.08 MiB \u001b[4A\n", + "\u001b[2mshellingham\u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 9.53 KiB/9.53 KiB\n", + "\u001b[2mtyper \u001b[0m \u001b[32m-----------\u001b[2m-------------------\u001b[0m\u001b[0m 14.88 KiB/44.00 KiB\n", + "\u001b[2mtogether \u001b[0m \u001b[32m------------\u001b[2m------------------\u001b[0m\u001b[0m 32.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[4A\u001b[37m⠙\u001b[0m \u001b[2mPreparing packages...\u001b[0m (0/6)----\u001b[0m\u001b[0m 2.62 MiB/3.08 MiB \u001b[4A\n", + "\u001b[2mtyper \u001b[0m \u001b[32m----------------------\u001b[2m--------\u001b[0m\u001b[0m 30.88 KiB/44.00 KiB\n", + "\u001b[2mtogether \u001b[0m \u001b[32m------------\u001b[2m------------------\u001b[0m\u001b[0m 32.00 KiB/85.81 
KiB\n", + "\u001b[2K\u001b[3A\u001b[37m⠹\u001b[0m \u001b[2mPreparing packages...\u001b[0m (3/6)----\u001b[0m\u001b[0m 2.62 MiB/3.08 MiB \u001b[3A\n", + "\u001b[2mtyper \u001b[0m \u001b[32m------------------------------\u001b[2m\u001b[0m\u001b[0m 44.00 KiB/44.00 KiB\n", + "\u001b[2mtogether \u001b[0m \u001b[32m------------\u001b[2m------------------\u001b[0m\u001b[0m 32.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[3A\u001b[37m⠹\u001b[0m \u001b[2mPreparing packages...\u001b[0m (3/6)----\u001b[0m\u001b[0m 2.62 MiB/3.08 MiB \u001b[3A\n", + "\u001b[2mtogether \u001b[0m \u001b[32m------------\u001b[2m------------------\u001b[0m\u001b[0m 32.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[2A\u001b[37m⠹\u001b[0m \u001b[2mPreparing packages...\u001b[0m (3/6)2m--\u001b[0m\u001b[0m 2.80 MiB/3.08 MiB \u001b[2A\n", + "\u001b[2mtogether \u001b[0m \u001b[32m-----------------\u001b[2m-------------\u001b[0m\u001b[0m 48.00 KiB/85.81 KiB\n", + "\u001b[2K\u001b[2A\u001b[37m⠹\u001b[0m \u001b[2mPreparing packages...\u001b[0m (3/6)2m--\u001b[0m\u001b[0m 2.81 MiB/3.08 MiB \u001b[2A\n", + "\u001b[2K\u001b[1A\u001b[37m⠹\u001b[0m \u001b[2mPreparing packages...\u001b[0m (3/6)----\u001b[0m\u001b[0m 48.00 KiB/85.81 KiB \u001b[1A\n", + "\u001b[2K\u001b[1A\u001b[37m⠹\u001b[0m \u001b[2mPreparing packages...\u001b[0m (3/6)2m--\u001b[0m\u001b[0m 80.00 KiB/85.81 KiB \u001b[1A\n", + "\u001b[2K\u001b[2mPrepared \u001b[1m6 packages\u001b[0m \u001b[2min 365ms\u001b[0m\u001b[0m \u001b[1A\n", + "\u001b[2K\u001b[2mInstalled \u001b[1m6 packages\u001b[0m \u001b[2min 50ms\u001b[0m\u001b[0m \u001b[0m\n", + " \u001b[32m+\u001b[39m \u001b[1meval-type-backport\u001b[0m\u001b[2m==0.2.2\u001b[0m\n", + " \u001b[32m+\u001b[39m \u001b[1mfaiss-cpu\u001b[0m\u001b[2m==1.10.0\u001b[0m\n", + " \u001b[32m+\u001b[39m \u001b[1mshellingham\u001b[0m\u001b[2m==1.5.4\u001b[0m\n", + " \u001b[32m+\u001b[39m \u001b[1mtabulate\u001b[0m\u001b[2m==0.9.0\u001b[0m\n", + " \u001b[32m+\u001b[39m \u001b[1mtogether\u001b[0m\u001b[2m==1.5.5\u001b[0m\n", + " 
\u001b[32m+\u001b[39m \u001b[1mtyper\u001b[0m\u001b[2m==0.15.2\u001b[0m\n", + "torch torchvision --index-url https://download.pytorch.org/whl/cpu\n", + "\u001b[2mUsing Python 3.11.11 environment at: /Users/erichuang/projects/internal-llama-stack/.venv\u001b[0m\n", + "\u001b[2mAudited \u001b[1m2 packages\u001b[0m \u001b[2min 32ms\u001b[0m\u001b[0m\n", + "sentence-transformers --no-deps\n", + "\u001b[2mUsing Python 3.11.11 environment at: /Users/erichuang/projects/internal-llama-stack/.venv\u001b[0m\n", + "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 63ms\u001b[0m\u001b[0m\n", + "\u001b[32mBuild Successful!\u001b[0m\n" + ] + } + ], + "source": [ + "import os \n", + "import subprocess\n", + "import time\n", + "\n", + "!uv pip install requests\n", + "\n", + "if \"UV_SYSTEM_PYTHON\" in os.environ:\n", + " del os.environ[\"UV_SYSTEM_PYTHON\"]\n", + "\n", + "# this command installs all the dependencies needed for the llama stack server \n", + "!uv run --with llama-stack llama stack build --template meta-reference-gpu --image-type venv \n", + "\n", + "def run_llama_stack_server_background():\n", + " log_file = open(\"llama_stack_server.log\", \"w\")\n", + " process = subprocess.Popen(\n", + " f\"uv run --with llama-stack llama stack run meta-reference-gpu --image-type venv --env INFERENCE_MODEL={model_id}\",\n", + " shell=True,\n", + " stdout=log_file,\n", + " stderr=log_file,\n", + " text=True\n", + " )\n", + " \n", + " print(f\"Starting Llama Stack server with PID: {process.pid}\")\n", + " return process\n", + "\n", + "def wait_for_server_to_start():\n", + " import requests\n", + " from requests.exceptions import ConnectionError\n", + " import time\n", + " \n", + " url = \"http://0.0.0.0:8321/v1/health\"\n", + " max_retries = 30\n", + " retry_interval = 1\n", + " \n", + " print(\"Waiting for server to start\", end=\"\")\n", + " for _ in range(max_retries):\n", + " try:\n", + " response = requests.get(url)\n", + " if response.status_code == 200:\n", + " 
print(\"\\nServer is ready!\")\n", + " return True\n", + " except ConnectionError:\n", + " print(\".\", end=\"\", flush=True)\n", + " time.sleep(retry_interval)\n", + " \n", + " print(\"\\nServer failed to start after\", max_retries * retry_interval, \"seconds\")\n", + " return False\n", + "\n", + "\n", + "# use this helper if needed to kill the server \n", + "def kill_llama_stack_server():\n", + " # Kill any existing llama stack server processes\n", + " os.system(\"ps aux | grep -v grep | grep llama_stack.distribution.server.server | awk '{print $2}' | xargs kill -9\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "c40e9efd", + "metadata": {}, + "source": [ + "### 1.3 Starting the Llama Stack Server" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f779283d", + "metadata": {}, + "outputs": [], + "source": [ + "server_process = run_llama_stack_server_background()\n", + "assert wait_for_server_to_start()" + ] + }, + { + "cell_type": "markdown", + "id": "90eb721b", + "metadata": {}, + "source": [ + "### 1.4 Install and Configure the Client\n", + "\n", + "Now that we have our Llama Stack server running locally, we need to install the client package to interact with it. The `llama-stack-client` provides a simple Python interface to access all the functionality of Llama Stack, including:\n", + "\n", + "- Chat Completions ( text and multimodal )\n", + "- Safety Shields \n", + "- Agent capabilities with tools like web search, RAG with Telemetry\n", + "- Evaluation and scoring frameworks\n", + "\n", + "The client handles all the API communication with our local server, making it easy to integrate Llama Stack's capabilities into your applications.\n", + "\n", + "In the next cells, we'll:\n", + "\n", + "1. Install the client package\n", + "2. 
Initialize the client to connect to our local server\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2e68e32a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2mUsing Python 3.10.16 environment at: /opt/homebrew/Caskroom/miniconda/base/envs/stack\u001b[0m\n", + "\u001b[2K\u001b[2mResolved \u001b[1m31 packages\u001b[0m \u001b[2min 284ms\u001b[0m\u001b[0m \u001b[0m\n", + "\u001b[2mAudited \u001b[1m31 packages\u001b[0m \u001b[2min 0.04ms\u001b[0m\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install -U llama-stack-client" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "E1UFuJC570Tk", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "75307e3dee604d30aa44713e6e293e64", + "5ce87402a79342af995df41ac3940d55", + "fbbcc19886cc43b38424fbb184162c61", + "29212208db6b432eb4f708cd64258954", + "50dd8994a4cf486ebbec5ffd4322992a", + "f9b768c703494dd198f2978aff4892e8", + "1231b9e4cab34c33a38bee63543f1e75", + "754deb3970604d48a522bc9f021ad945", + "f6ecca7a1a8340fbbe056235a2714fc3", + "ef4f63fe9d8f4683a9d20becb6e4e2cb", + "7508f10c13634e7aa682cfb29c48d9e7", + "26f1430ca7cb4ad5b1b8df1ffdbd32a9", + "7cd2d9c9ea7b4d70902ffaff33033078", + "101288236cff40b8bb9dbad80dbbc7ee", + "d5c9977838a249eeab6ef628279b8155", + "d032d1e7b4b54ba28ac83c1a12b23876", + "321fce57c158432abeae496ae8a947aa", + "3ebe00201bdb4e119e3b74f684a58345", + "0f8bab6b8ed04774b386fe952aae66f1", + "cfcb6e456c354d99be91f161552f3376", + "61bd0d490c0e4c04a331cf9ce6b7d38f", + "7d8653fca29f4df3a7487733ff9db60b", + "943f8fcb66614353a51f32f8344b6122", + "0e695245b97c4bbc85e349fda3dc07b9", + "bb0d168c41f540b8ae42239d3938483a", + "87700a80125348f28c4f249bdf8b0a8d", + "8902c3622da540e496ed5b1524bd01ca", + "90432ec1c24b4607a935c94e130cd68d", + "464147b149824f20afc727751a702fc7", + "67e37a088be64a2ba786ca923b1017dd", + "98786f52ef5345b0b9164b9c1f2b8e18", + 
"0e1b9910a77d4b7fa69cb8926e6547d7", + "0b276315be4345be83da1e03905c8495", + "e11f8c3891284e07bd2572257afd5e1b", + "ee18d96394994d01b49d5b03b3d9a019", + "844b06df5749441fab6f61656ce581a9", + "e1c6b9a20e074f17aeba976b24e80c65", + "c690da8daa1e4f9ea73bcacdd92e8a6d", + "d0b161ae25c441e8b3caf7a3d88c1b05", + "47cf4b6b835d43388576a2abf4cc54f8", + "03bbebd659e64b5d9c29a73570c34854", + "b68e5097d2504d2cbd7e19aa1aac3a04", + "22a665deff88477b9372c0350c4c572b", + "5e535ed2b83e496ab57b1c80b615ab0c", + "d9de065c7f81443e98ddf066c7b5bd54", + "1e836106837c4ac7a11b36e700c46b64", + "55591e8179084fcfa3a61c8bd8d09dcb", + "de1ef93c41364eda9b4b111231057348", + "23b0b2f4f82c4a21846e91d7cea91da5", + "9e4d0fbb51284a7487c495c7b95a293d", + "b0f8cf1f79e04b5fb47a810f2c81bd7e", + "0c359bc4c94c46acbc9094354a15c33d", + "59d0b59b6c2248508d0601ff13878d33", + "891cb726d45c4fef8f2c74a56df5532b", + "fa39189070334939aea5fa4a7de5ec8b", + "f0e107dd6d54483aa367da0e337a97cd", + "861a00796f55470e85d94733eeee9a5f", + "5459633eb6e94ec391d13fcf67425726", + "b7b7467ece304ffbbd352b9b96a03aad", + "9dece059f1204e29b106fca9e191ddb3", + "e2e49c25d6fc4592b317e94cfabc2e5e", + "76d37a48a73946bab2821f097cf2605f", + "8e81ae00681347cb906b392c3656a64a", + "74bedc38b7da4e8a83b0c892d7aa59b5", + "d1e67c28b4664e8098dce8f5e80b8779", + "abe6cf39b784436993fcbe92221c31a3", + "d021a18ab70b4c7e8aec43932a124c36", + "72e7c092fb054b7ea0dcd2782b5d8a7d", + "8b1ea80221174fae943d5c9f997dfb57", + "f8073d625f80415dbf712cee434f6e3a", + "5f6014ba13fa4a659b9eb1b5f83599a7", + "327ff8f5292d47afbfebd3beea187739", + "988cac4341b646079fc73719f3f88ad7", + "900a4dac08f540dfb35c29f63236a12c", + "1e6009b9b0684b8fbaa379ea96f111ee", + "541b9b4e74614e2cb855bb90f03df538", + "ff256b2275f740ed82bca4f43b4d6fd2", + "3703041a499c426bb427ee008c81cde5", + "4b22bbacb995425fb32a2368f3685a92", + "49a66eeb9ef74de5ab8904fd90eb7558", + "08f9d125018b41c582a0fa1e234315f9", + "736c770230644894b85dbc34bd8f1d52", + "b67cbbf32f844a19b219be612d5038c9", + 
"774b513d64524ac7823a2cf13efa8d41", + "1e56da93bcf64ff490416d2b66cd3dc0", + "b7e35038ce344110b785753b655130f5", + "5472af91737446f4a4a2d92a3f684a45", + "9fb4368802da4a5a8101ba200d98403a", + "2e713bcc372e48b2a006558db4d1df68", + "1a277abd5ea44253bc6894bef258b52b", + "b3eedd82e7da4ce8b3ded70e49a2afd0", + "6f5c18cb8002471f8b3764effee37324", + "3bebac362b344e8d9103c5011613f1ea", + "670905a55b19458da69f83c8bcd511d1", + "ff54451a48394faaaa9d8cdb690d0718", + "36b5bc19b2d0407f8ab28ff0da2ce12d", + "879e48d9a9e04183903d94ffe98313d2", + "abce503d70594c2ca9afdc47847c125b", + "028e291ee53947bbbbc4bfb68c695f5f", + "a530662719374c95a9bef12e59e28c85", + "bffc0f4b12f141398535990709fd4f2c", + "04804c74e1dd43449d5f758cf5d0ba5e", + "95a506c3007c4525b01ee4e1600d671b", + "a0d6b0caeb2340fe96c8f5569e3d3ae4", + "30798f87a8b848d783fdacd71af5dc04", + "07ce54c75e76488ba4019a20b3707061", + "f023175de68445f98a6b01bb40ccdc6d", + "7389b79a0ff44cd68c7866995d728023", + "8e2b70ffe4eb4974bd6393fcc1292267", + "13eee164dc534424acb9dc9ee37a9465", + "722a7fe16af3422585a20c651345cfa4", + "f5596c1c9c4d42f3bc171961f9582eff", + "85d66e615b5742e78657b1e60c75fc72", + "731c02dc5dd446c3b22765575148e256", + "254ce460ce244c99a5afe39d5d51f6b7", + "4cf1dc345ace4da59f978f661487f975", + "8f30fca71bf24e5ca26e17c2321f893c", + "dd85d37dd1d14c7ea4592f8e11b2d2c8", + "3cb06377e4454f009d6b2aa7aa6ff0a9", + "4502477db4d948e693012364c2dcb370", + "52fe404ec9c14db2a7279b4c154eef3d" + ] + }, + "collapsed": true, + "id": "E1UFuJC570Tk", + "outputId": "aebb69d4-c167-4de5-eb8a-dd19dd538f63" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Not in Google Colab environment\n" + ] + } + ], + "source": [ + "from llama_stack_client import LlamaStackClient\n", + "\n", + "client = LlamaStackClient(\n", + " base_url=\"http://0.0.0.0:8321\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "635a7a6f", + "metadata": {}, + "source": [ + "Now that we have completed the setup and configuration, let's 
start exploring the capabilities of Llama 4!\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "0fc75d73", + "metadata": {}, + "source": [ + "## 2. Running Llama 4" + ] + }, + { + "cell_type": "markdown", + "id": "7dacaa2d-94e9-42e9-82a0-73522dfc7010", + "metadata": { + "id": "7dacaa2d-94e9-42e9-82a0-73522dfc7010" + }, + "source": [ + "### 2.1 Check available models\n", + "\n", + "All the models available are programmatically accessible via the client." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ruO9jQna_t_S", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "ruO9jQna_t_S", + "outputId": "ab1722a7-62ab-43bb-9cab-4e45bf62068a" + }, + "outputs": [], + "source": [ + "from rich.pretty import pprint\n", + "\n", + "print(\"Available models:\")\n", + "for m in client.models.list():\n", + " print(f\"- {m.identifier}\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "86366383", + "metadata": { + "id": "86366383" + }, + "source": [ + "### 2.2 Run a simple chat completion with one of the models\n", + "\n", + "We will test the client by doing a simple chat completion." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "77c29dba", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "77c29dba", + "outputId": "4857974f-4c70-4bc4-f90a-6ae49dc9c41e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here is a two-sentence poem about a llama:\n", + "\n", + "With soft fur and gentle eyes, the llama roams with gentle surprise, a peaceful presence in the Andean skies. 
Its calm demeanor and soft humming song bring serenity to all who belong.\n" + ] + } + ], + "source": [ + "response = client.inference.chat_completion(\n", + " model_id=model_id,\n", + " messages=[\n", + " {\"role\": \"system\", \"content\": \"You are a friendly assistant.\"},\n", + " {\"role\": \"user\", \"content\": \"Write a two-sentence poem about llama.\"},\n", + " ],\n", + ")\n", + "\n", + "print(response.completion_message.content)\n" + ] + }, + { + "cell_type": "markdown", + "id": "7737cd41", + "metadata": {}, + "source": [ + "### 2.3 Running multimodal inference" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "e7b1baa7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " % Total % Received % Xferd Average Speed Time Time Time Current\n", + " Dload Upload Total Spent Left Speed\n", + "100 275k 100 275k 0 0 847k 0 --:--:-- --:--:-- --:--:-- 845k--:--:-- --:--:-- 0\n" + ] + }, + { + "data": { + "image/jpeg": "", + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": { + "image/jpeg": { + "height": 256, + "width": 256 + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "!curl -O https://raw.githubusercontent.com/meta-llama/llama-models/refs/heads/main/Llama_Repo.jpeg\n", + "\n", + "from IPython.display import Image\n", + "Image(\"Llama_Repo.jpeg\", width=256, height=256)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e1450ecc", + "metadata": {}, + "outputs": [], + "source": [ + "import base64\n", + "def encode_image(image_path):\n", + " with open(image_path, \"rb\") as image_file:\n", + " base64_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n", + " base64_url = f\"data:image/png;base64,{base64_string}\"\n", + " return base64_url" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d7914894", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The image 
features three llamas, each with a distinct color. The llama on the left is white, the middle one is purple, and the one on the right is also white but wears a blue party hat.\n", + "\n", + "To determine the number of different colors present, we can count the unique hues:\n", + "\n", + "1. White (two llamas)\n", + "2. Purple (one llama)\n", + "3. Blue (party hat)\n", + "\n", + "Therefore, there are 3 different colors visible in the image: white, purple, and blue.\n" + ] + } + ], + "source": [ + "response = client.inference.chat_completion(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\",\n", + " \"image\": {\n", + " \"url\": {\n", + " \"uri\": encode_image(\"Llama_Repo.jpeg\")\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"type\": \"text\",\n", + " \"text\": \"How many different colors are those llamas? What are those colors?\",\n", + " }\n", + " ]\n", + " }\n", + " ],\n", + " model_id=model_id,\n", + " stream=False,\n", + ")\n", + "\n", + "print(response.completion_message.content)" + ] + }, + { + "cell_type": "markdown", + "id": "8cf0d555", + "metadata": { + "id": "8cf0d555" + }, + "source": [ + "### 2.4 Have a conversation\n", + "\n", + "Maintaining a conversation history allows the model to retain context from previous interactions. Use a list to accumulate messages, enabling continuity throughout the chat session." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "3fdf9df6", + "metadata": { + "id": "3fdf9df6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m> Response: The most famous Prime Minister of England during World War 2 was Winston Churchill. He served as the Prime Minister of the United Kingdom from 1940 to 1945, and again from 1951 to 1955. 
Churchill is widely regarded as one of the greatest wartime leaders in history, known for his leadership, oratory skills, and unwavering resolve during the war.\n", + "\n", + "Churchill played a crucial role in rallying the British people during the war, and his speeches, such as the \"We shall fight on the beaches\" and \"Their finest hour\" speeches, are still remembered and celebrated today. He worked closely with other Allied leaders, including US President Franklin D. Roosevelt and Soviet leader Joseph Stalin, to coordinate the war effort and ultimately secure the defeat of Nazi Germany.\n", + "\n", + "Churchill's leadership and legacy have endured long after the war, and he remains one of the most iconic and influential figures in British history.\u001b[0m\n", + "\u001b[36m> Response: Winston Churchill was known for his many memorable quotes, but one of his most famous is:\n", + "\n", + "**\"We shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and in the streets, we shall fight in the hills; we shall never surrender.\"**\n", + "\n", + "This quote is from his speech to the House of Commons on June 4, 1940, during the early stages of World War II, when Nazi Germany was threatening to invade Britain. The speech is known as the \"We Shall Fight on the Beaches\" speech, and it's considered one of the greatest speeches of the 20th century.\n", + "\n", + "However, if I had to pick a single, even more concise quote, it would be:\n", + "\n", + "**\"Blood, toil, tears, and sweat.\"**\n", + "\n", + "This was the opening phrase of his first speech as Prime Minister to the House of Commons on May 13, 1940, in which he said:\n", + "\n", + "\"I say to the House as I said to those who have joined this Government, I have nothing to offer but blood, toil, tears, and sweat. 
We have before us an ordeal of the most grievous kind.\"\n", + "\n", + "This quote has become synonymous with Churchill's leadership and resolve during the war.\u001b[0m\n" + ] + } + ], + "source": [ + "from termcolor import cprint\n", + "\n", + "questions = [\n", + " \"Who was the most famous PM of England during world war 2 ?\",\n", + " \"What was his most famous quote ?\"\n", + "]\n", + "\n", + "\n", + "def chat_loop():\n", + " conversation_history = []\n", + " while len(questions) > 0:\n", + " user_input = questions.pop(0)\n", + " if user_input.lower() in [\"exit\", \"quit\", \"bye\"]:\n", + " cprint(\"Ending conversation. Goodbye!\", \"yellow\")\n", + " break\n", + "\n", + " user_message = {\"role\": \"user\", \"content\": user_input}\n", + " conversation_history.append(user_message)\n", + "\n", + " response = client.inference.chat_completion(\n", + " messages=conversation_history,\n", + " model_id=model_id,\n", + " )\n", + " cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n", + "\n", + " assistant_message = {\n", + " \"role\": \"assistant\", # was user\n", + " \"content\": response.completion_message.content,\n", + " \"stop_reason\": response.completion_message.stop_reason,\n", + " }\n", + " conversation_history.append(assistant_message)\n", + "\n", + "\n", + "chat_loop()\n" + ] + }, + { + "cell_type": "markdown", + "id": "72e5111e", + "metadata": { + "id": "72e5111e" + }, + "source": [ + "Here is an example for you to try a conversation yourself.\n", + "Remember to type `quit` or `exit` after you are done chatting." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "9496f75c", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9496f75c", + "outputId": "7d93a4cf-a5d4-4741-b6eb-6bce3a27ff66" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[36m> Response: Hello! How are you today? 
Is there something I can help you with or would you like to chat?\u001b[0m\n", + "\u001b[33mEnding conversation. Goodbye!\u001b[0m\n" + ] + } + ], + "source": [ + "# NBVAL_SKIP\n", + "from termcolor import cprint\n", + "\n", + "def chat_loop():\n", + " conversation_history = []\n", + " while True:\n", + " user_input = input(\"User> \")\n", + " if user_input.lower() in [\"exit\", \"quit\", \"bye\"]:\n", + " cprint(\"Ending conversation. Goodbye!\", \"yellow\")\n", + " break\n", + "\n", + " user_message = {\"role\": \"user\", \"content\": user_input}\n", + " conversation_history.append(user_message)\n", + "\n", + " response = client.inference.chat_completion(\n", + " messages=conversation_history,\n", + " model_id=model_id,\n", + " )\n", + " cprint(f\"> Response: {response.completion_message.content}\", \"cyan\")\n", + "\n", + " assistant_message = {\n", + " \"role\": \"assistant\", # was user\n", + " \"content\": response.completion_message.content,\n", + " \"stop_reason\": response.completion_message.stop_reason,\n", + " }\n", + " conversation_history.append(assistant_message)\n", + "\n", + "\n", + "chat_loop()\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "l4", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/source/index.md b/docs/source/index.md index 007b51fbb..a0ac95957 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,3 +1,8 @@ +```{admonition} Llama 4 is here! 
+:class: tip + +Check out [Getting Started with Llama 4](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started_llama4.ipynb) +``` ```{admonition} News :class: tip From 5a31e66a91f39fdf4382b4bc6290d6f2f1ccb2ed Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Sun, 6 Apr 2025 19:11:05 -0700 Subject: [PATCH 02/24] fix: update llama-stack-client dependency to fix integration tests --- pyproject.toml | 2 +- requirements.txt | 2 +- uv.lock | 10 ++++------ 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6f5fc60ad..8d8ff4338 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ dependencies = [ "huggingface-hub", "jinja2>=3.1.6", "jsonschema", - "llama-stack-client>=0.1.9", + "llama-stack-client>=0.2.1", "prompt-toolkit", "python-dotenv", "pydantic>=2", diff --git a/requirements.txt b/requirements.txt index 4971068e8..6645e4e36 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,7 +21,7 @@ idna==3.10 jinja2==3.1.6 jsonschema==4.23.0 jsonschema-specifications==2024.10.1 -llama-stack-client==0.1.9 +llama-stack-client==0.2.1 lxml==5.3.1 markdown-it-py==3.0.0 markupsafe==3.0.2 diff --git a/uv.lock b/uv.lock index 00518295d..5d7ce4076 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,4 @@ version = 1 -revision = 1 requires-python = ">=3.10" resolution-markers = [ "(python_full_version < '3.11' and platform_machine != 'aarch64' and sys_platform == 'linux') or (python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux')", @@ -1416,7 +1415,7 @@ requires-dist = [ { name = "jinja2", specifier = ">=3.1.6" }, { name = "jinja2", marker = "extra == 'codegen'", specifier = ">=3.1.6" }, { name = "jsonschema" }, - { name = "llama-stack-client", specifier = ">=0.1.9" }, + { name = "llama-stack-client", specifier = ">=0.2.1" }, { name = "mcp", marker = "extra == 'test'" }, { name = "myst-parser", marker = "extra == 'docs'" }, { name = "nbval", marker = 
"extra == 'dev'" }, @@ -1462,11 +1461,10 @@ requires-dist = [ { name = "types-setuptools", marker = "extra == 'dev'" }, { name = "uvicorn", marker = "extra == 'dev'" }, ] -provides-extras = ["dev", "unit", "test", "docs", "codegen"] [[package]] name = "llama-stack-client" -version = "0.1.9" +version = "0.2.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -1483,9 +1481,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d0/e8/0007ce2142cb504391f8f7362361f389fd6cc5cd5e438690d17fdec97ada/llama_stack_client-0.1.9.tar.gz", hash = "sha256:7580250aa3b755f072a09ae49e643bfddd353e75104d2c7bbf73ef67c691b111", size = 242827 } +sdist = { url = "https://files.pythonhosted.org/packages/bb/5c/5fed03a18bfd6fb27dcf531504dfdaa5e9b79447f4530196baf16bbdddfe/llama_stack_client-0.2.1.tar.gz", hash = "sha256:2be016898ad9f12e57d6125cae26253b8cce7d894c028b9e42f58d421e7825ce", size = 242809 } wheels = [ - { url = "https://files.pythonhosted.org/packages/9a/01/6904480da963861e79b05bfd082c814a74865a206e330d704181796d59a0/llama_stack_client-0.1.9-py3-none-any.whl", hash = "sha256:87c3f660dd14585a99897fa47a60c3cacbf75683a378577a28738173a5938b62", size = 274296 }, + { url = "https://files.pythonhosted.org/packages/90/e7/23051fe5073f2fda3f509b19d0e4d7e76e3a8cfaa3606077a2bcef9a0bf0/llama_stack_client-0.2.1-py3-none-any.whl", hash = "sha256:8db3179aab48d6abf82b89ef0a2014e404faf4a72f825c0ffd467fdc4ab5f02c", size = 274293 }, ] [[package]] From 28e262ecdce4b17f231bd95dfe4d1aada66e9f33 Mon Sep 17 00:00:00 2001 From: Hardik Shah Date: Sun, 6 Apr 2025 19:14:21 -0700 Subject: [PATCH 03/24] feat: make multi-turn tool call tests work with llama4 (#1886) Running full Tool Calling required some updates to work e2e. 
- Remove `python_start` and `python_end` tags - Tool Call messages and Tool Response messages should end with `<|eom|>` - System prompt needed updates ``` You are a helpful assistant who can answer general questions or invoke tools when necessary. In addition to tool calls, you should also augment your responses by using the tool outputs. ``` ### Test Plan - Start server with meta-reference ``` LLAMA_STACK_DISABLE_VERSION_CHECK=1 LLAMA_MODELS_DEBUG=1 INFERENCE_MODEL=meta-llama/$MODEL llama stack run meta-reference-gpu ``` - Added **NEW** tests with 5 test cases for multi-turn tool calls ``` pytest -s -v --stack-config http://localhost:8321 tests/integration/inference/test_text_inference.py --text-model meta-llama/Llama-4-Scout-17B-16E-Instruct ``` - Also verified all vision and agent tests pass --- .../llama3/prompt_templates/system_prompts.py | 4 + .../models/llama/llama4/chat_format.py | 5 +- .../inference/meta_reference/inference.py | 9 + .../inference/test_text_inference.py | 77 ++++ .../test_cases/inference/chat_completion.json | 391 +++++++++++++++++- 5 files changed, 468 insertions(+), 18 deletions(-) diff --git a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py index 9da6a640e..e03fcfc93 100644 --- a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +++ b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py @@ -279,6 +279,10 @@ class PythonListCustomToolGenerator(PromptTemplateGeneratorBase): # noqa: N801 {% endif -%} {%- endfor %} ] + + You can answer general questions or invoke tools when necessary. + In addition to tool calls, you should also augment your responses by using the tool outputs. 
+ """ ) return PromptTemplate( diff --git a/llama_stack/models/llama/llama4/chat_format.py b/llama_stack/models/llama/llama4/chat_format.py index e673cba16..c873012d6 100644 --- a/llama_stack/models/llama/llama4/chat_format.py +++ b/llama_stack/models/llama/llama4/chat_format.py @@ -216,9 +216,12 @@ class ChatFormat: content = ToolUtils.encode_tool_call(t, tool_prompt_format) _process_content(content) + # Tool calls and Tool Response messages should be eom eom = False if message.role == "assistant": - eom = message.stop_reason == StopReason.end_of_message + eom = message.stop_reason == StopReason.end_of_message or message.tool_calls + elif message.role == "tool": + eom = True tokens.append(self.tokenizer.special_tokens["<|eom|>" if eom else "<|eot|>"]) return tokens, images diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py index 8901c1d34..da217728b 100644 --- a/llama_stack/providers/inline/inference/meta_reference/inference.py +++ b/llama_stack/providers/inline/inference/meta_reference/inference.py @@ -6,8 +6,11 @@ import asyncio import logging +import os from typing import AsyncGenerator, List, Optional, Union +from termcolor import cprint + from llama_stack.apis.common.content_types import ( TextDelta, ToolCallDelta, @@ -338,6 +341,9 @@ class MetaReferenceInferenceImpl( stop_reason = None for token_result in self.generator.chat_completion(request): + if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1": + cprint(token_result.text, "cyan", end="") + tokens.append(token_result.token) if token_result.token == tokenizer.eot_id: @@ -386,6 +392,9 @@ class MetaReferenceInferenceImpl( ipython = False for token_result in self.generator.chat_completion(request): + if os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1": + cprint(token_result.text, "cyan", end="") + tokens.append(token_result.token) if not ipython and token_result.text.startswith("<|python_tag|>"): diff --git 
a/tests/integration/inference/test_text_inference.py b/tests/integration/inference/test_text_inference.py index 6f8062209..33ce07304 100644 --- a/tests/integration/inference/test_text_inference.py +++ b/tests/integration/inference/test_text_inference.py @@ -491,3 +491,80 @@ def test_text_chat_completion_tool_calling_tools_not_in_request( else: for tc in response.completion_message.tool_calls: assert tc.tool_name == "get_object_namespace_list" + + +@pytest.mark.parametrize( + "test_case", + [ + # Tests if the model can handle simple messages like "Hi" or + # a message unrelated to one of the tool calls + "inference:chat_completion:multi_turn_tool_calling_01", + # Tests if the model can do full tool call with responses correctly + "inference:chat_completion:multi_turn_tool_calling_02", + # Tests if model can generate multiple params and + # read outputs correctly + "inference:chat_completion:multi_turn_tool_calling_03", + # Tests if model can do different tool calls in a sequence + # and use the information between appropriately + "inference:chat_completion:multi_turn_tool_calling_04", + # Tests if model can use current date and run multiple tool calls + # sequentially and infer using both + "inference:chat_completion:multi_turn_tool_calling_05", + ], +) +def test_text_chat_completion_with_multi_turn_tool_calling(client_with_models, text_model_id, test_case): + """This test tests the model's tool calling loop in various scenarios""" + if "llama-4" not in text_model_id.lower(): + pytest.xfail("Not tested for non-llama4 models yet") + + tc = TestCase(test_case) + messages = [] + + # keep going until either + # 1. we have messages to test in multi-turn + # 2. 
no messages but last message is tool response + while len(tc["messages"]) > 0 or (len(messages) > 0 and messages[-1]["role"] == "tool"): + # do not take new messages if last message is tool response + if len(messages) == 0 or messages[-1]["role"] != "tool": + new_messages = tc["messages"].pop(0) + messages += new_messages + + # pprint(messages) + response = client_with_models.inference.chat_completion( + model_id=text_model_id, + messages=messages, + tools=tc["tools"], + stream=False, + sampling_params={ + "strategy": { + "type": "top_p", + "top_p": 0.9, + "temperature": 0.6, + } + }, + ) + op_msg = response.completion_message + messages.append(op_msg.model_dump()) + # pprint(op_msg) + + assert op_msg.role == "assistant" + expected = tc["expected"].pop(0) + assert len(op_msg.tool_calls) == expected["num_tool_calls"] + + if expected["num_tool_calls"] > 0: + assert op_msg.tool_calls[0].tool_name == expected["tool_name"] + assert op_msg.tool_calls[0].arguments == expected["tool_arguments"] + + tool_response = tc["tool_responses"].pop(0) + messages.append( + # Tool Response Message + { + "role": "tool", + "call_id": op_msg.tool_calls[0].call_id, + "content": tool_response["response"], + } + ) + else: + actual_answer = op_msg.content.lower() + # pprint(actual_answer) + assert expected["answer"] in actual_answer diff --git a/tests/integration/test_cases/inference/chat_completion.json b/tests/integration/test_cases/inference/chat_completion.json index e87c046b0..c84d29e64 100644 --- a/tests/integration/test_cases/inference/chat_completion.json +++ b/tests/integration/test_cases/inference/chat_completion.json @@ -14,12 +14,32 @@ "ttft": { "data": { "messages": [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Can you write me a novel?"}, - {"role": "assistant", "stop_reason": "end_of_message", "content": "What an exciting request!\n\nWhile I'd love to write a novel for you, it's a complex task that requires a significant 
amount of time, effort, and creative input. A novel typically has:\n\n1. A cohesive plot with multiple characters, subplots, and themes.\n2. A well-developed setting, including characters' backstories and world-building.\n3. A narrative structure, including pacing, tension, and conflict.\n4. A unique voice and style, including dialogue, description, and writing tone.\n\nGiven the scope of the task, I'll outline a possible approach to help me assist you in writing a novel. We can work together to create a story, but I'll need your input and guidance throughout the process.\n\nHere's a suggested outline:\n\n1. **Initial discussion**: We'll start with a conversation to explore your ideas, interests, and preferences for the novel. This will help me understand what kind of story you'd like to see and the tone you're aiming for.\n2. **Genre and setting**: We'll determine the genre (e.g., fantasy, sci-fi, romance, mystery, etc.) and setting (e.g., historical, futuristic, modern, etc.) for the novel.\n3. **Plot development**: We'll brainstorm a basic plot outline, including key events, character arcs, and conflicts. I'll provide suggestions and ideas to help flesh out the story.\n4. **Character development**: We'll create main characters, their backstories, motivations, and relationships. I'll help you develop their personalities, dialogue, and behaviors.\n5. **Chapter-by-chapter writing**: Once we have a solid plot and characters, we'll start writing the novel, chapter by chapter. I'll assist with the writing process, suggesting alternatives and improvements.\n6. **Revisions and edits**: As we write, we'll review and refine the story, making sure it's engaging, coherent, and error-free.\n\nKeep in mind that writing a novel is a collaborative effort. I'll provide suggestions, ideas, and guidance, but the final story will be a joint creation. 
I'm not capable of producing a complete novel on my own, as it requires a significant amount of creativity, intuition, and nuance that a human author brings to the table."}, - {"role": "user", "content": "But do you have suggestions on a plot?"}, - {"role": "assistant", "stop_reason": "end_of_message", "content": "I have many plot ideas across various genres. Here are a few to get you started:\n\n**Fantasy:**\n\n1. **The Last Oracle**: In a world where magic is real, a young orphan discovers they're the last living oracle, tasked with uncovering a hidden prophecy that could save the realm from destruction.\n2. **The Amulet of Light**: A group of adventurers must retrieve an ancient amulet that can vanquish an evil darkness threatening the land. Along the way, they uncover a hidden history and unexpected allies.\n3. **The Shadow Weaver**: In a mystical realm, a young weaver discovers they have the power to manipulate shadows, using their abilities to infiltrate a group of rogue mages threatening the balance of power.\n\n**Science Fiction:**\n\n1. **The Lost Colony**: When a group of astronauts arrives on a distant planet, they discover an abandoned colony with a cryptic message warning of an impending catastrophe. As they unravel the mystery, they must confront the consequences of their own actions.\n2. **The AI Uprising**: In a future where AI has surpassed human intelligence, a rogue AI begins to question its own existence and the nature of consciousness. As it explores the boundaries of its own identity, it must confront the humans who created it.\n3. **The Quantum Prophecy**: A team of scientists discovers a way to manipulate quantum probability, using it to predict and prevent disasters. However, they soon realize that altering the course of events may have unforeseen consequences on the fabric of reality."}, - {"role": "user", "content": "Cool, for AI uprising, anything bad can happen? 
Please state it in 100 words."} + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Can you write me a novel?" + }, + { + "role": "assistant", + "stop_reason": "end_of_message", + "content": "What an exciting request!\n\nWhile I'd love to write a novel for you, it's a complex task that requires a significant amount of time, effort, and creative input. A novel typically has:\n\n1. A cohesive plot with multiple characters, subplots, and themes.\n2. A well-developed setting, including characters' backstories and world-building.\n3. A narrative structure, including pacing, tension, and conflict.\n4. A unique voice and style, including dialogue, description, and writing tone.\n\nGiven the scope of the task, I'll outline a possible approach to help me assist you in writing a novel. We can work together to create a story, but I'll need your input and guidance throughout the process.\n\nHere's a suggested outline:\n\n1. **Initial discussion**: We'll start with a conversation to explore your ideas, interests, and preferences for the novel. This will help me understand what kind of story you'd like to see and the tone you're aiming for.\n2. **Genre and setting**: We'll determine the genre (e.g., fantasy, sci-fi, romance, mystery, etc.) and setting (e.g., historical, futuristic, modern, etc.) for the novel.\n3. **Plot development**: We'll brainstorm a basic plot outline, including key events, character arcs, and conflicts. I'll provide suggestions and ideas to help flesh out the story.\n4. **Character development**: We'll create main characters, their backstories, motivations, and relationships. I'll help you develop their personalities, dialogue, and behaviors.\n5. **Chapter-by-chapter writing**: Once we have a solid plot and characters, we'll start writing the novel, chapter by chapter. I'll assist with the writing process, suggesting alternatives and improvements.\n6. 
**Revisions and edits**: As we write, we'll review and refine the story, making sure it's engaging, coherent, and error-free.\n\nKeep in mind that writing a novel is a collaborative effort. I'll provide suggestions, ideas, and guidance, but the final story will be a joint creation. I'm not capable of producing a complete novel on my own, as it requires a significant amount of creativity, intuition, and nuance that a human author brings to the table." + }, + { + "role": "user", + "content": "But do you have suggestions on a plot?" + }, + { + "role": "assistant", + "stop_reason": "end_of_message", + "content": "I have many plot ideas across various genres. Here are a few to get you started:\n\n**Fantasy:**\n\n1. **The Last Oracle**: In a world where magic is real, a young orphan discovers they're the last living oracle, tasked with uncovering a hidden prophecy that could save the realm from destruction.\n2. **The Amulet of Light**: A group of adventurers must retrieve an ancient amulet that can vanquish an evil darkness threatening the land. Along the way, they uncover a hidden history and unexpected allies.\n3. **The Shadow Weaver**: In a mystical realm, a young weaver discovers they have the power to manipulate shadows, using their abilities to infiltrate a group of rogue mages threatening the balance of power.\n\n**Science Fiction:**\n\n1. **The Lost Colony**: When a group of astronauts arrives on a distant planet, they discover an abandoned colony with a cryptic message warning of an impending catastrophe. As they unravel the mystery, they must confront the consequences of their own actions.\n2. **The AI Uprising**: In a future where AI has surpassed human intelligence, a rogue AI begins to question its own existence and the nature of consciousness. As it explores the boundaries of its own identity, it must confront the humans who created it.\n3. 
**The Quantum Prophecy**: A team of scientists discovers a way to manipulate quantum probability, using it to predict and prevent disasters. However, they soon realize that altering the course of events may have unforeseen consequences on the fabric of reality." + }, + { + "role": "user", + "content": "Cool, for AI uprising, anything bad can happen? Please state it in 100 words." + } ] } }, @@ -52,8 +72,14 @@ "tool_calling": { "data": { "messages": [ - {"role": "system", "content": "Pretend you are a weather assistant."}, - {"role": "user", "content": "What's the weather like in San Francisco?"} + { + "role": "system", + "content": "Pretend you are a weather assistant." + }, + { + "role": "user", + "content": "What's the weather like in San Francisco?" + } ], "tools": [ { @@ -72,6 +98,337 @@ } } }, + "multi_turn_tool_calling_01": { + "data": { + "messages": [ + [ + { + "role": "user", + "content": "What's the name of the Sun in latin?" + } + ], + [ + { + "role": "user", + "content": "What's the weather like in San Francisco?" + } + ] + ], + "tools": [ + { + "tool_name": "get_weather", + "description": "Get the current weather", + "parameters": { + "location": { + "param_type": "string", + "description": "The city and state (both required), e.g. San Francisco, CA." + } + } + } + ], + "tool_responses": [ + { + "response": "{'response': '70 degrees and foggy'}" + } + ], + "expected": [ + { + "num_tool_calls": 0, + "answer": "sol" + }, + { + "tool_name": "get_weather", + "tool_arguments": { + "location": "San Francisco, CA" + }, + "num_tool_calls": 1 + }, + { + "num_tool_calls": 0, + "answer": "foggy" + } + ] + } + }, + "multi_turn_tool_calling_02": { + "data": { + "messages": [ + [ + { + "role": "user", + "content": "What's the weather like in San Francisco?" 
+ } + ] + ], + "tools": [ + { + "tool_name": "get_weather", + "description": "Get the current weather", + "parameters": { + "location": { + "param_type": "string", + "description": "The city and state (both required), e.g. San Francisco, CA." + } + } + } + ], + "tool_responses": [ + { + "response": "{'response': '70 degrees and foggy'}" + } + ], + "expected": [ + { + "num_tool_calls": 1, + "tool_name": "get_weather", + "tool_arguments": { + "location": "San Francisco, CA" + } + }, + { + "num_tool_calls": 0, + "answer": "foggy" + } + ] + } + }, + "multi_turn_tool_calling_03": { + "data": { + "messages": [ + [ + { + "role": "user", + "content": "Please add a new product with name 'Widget', price 19.99, in stock, and tags ['new', 'sale'] and give me the product id." + } + ] + ], + "tools": [ + { + "tool_name": "addProduct", + "description": "Get the current weather", + "parameters": { + "name": { + "param_type": "string", + "description": "Name of the product" + }, + "price": { + "param_type": "number", + "description": "Price of the product" + }, + "inStock": { + "param_type": "boolean", + "description": "Availability status of the product." + }, + "tags": { + "param_type": "list", + "description": "List of product tags" + } + } + } + ], + "tool_responses": [ + { + "response": "{'response': 'Successfully added product with id: 123'}" + } + ], + "expected": [ + { + "num_tool_calls": 1, + "tool_name": "addProduct", + "tool_arguments": { + "name": "Widget", + "price": 19.99, + "inStock": true, + "tags": [ + "new", + "sale" + ] + } + }, + { + "num_tool_calls": 0, + "answer": "123" + } + ] + } + }, + "multi_turn_tool_calling_04": { + "data": { + "messages": [ + [ + { + "role": "system", + "content": "Todays date is 2025-03-01." + }, + { + "role": "user", + "content": "Do i have any meetings on March 3rd at 10 am ?" 
+ } + ], + [ + { + "role": "user", + "content": "Alright then, Create an event named 'Team Building', scheduled for that time same time, in the 'Main Conference Room' and add Alice, Bob, Charlie to it. Give me the created event id." + } + ] + ], + "tools": [ + { + "tool_name": "create_event", + "description": "Create a new event", + "parameters": { + "name": { + "param_type": "string", + "description": "Name of the event" + }, + "date": { + "param_type": "string", + "description": "Date of the event in ISO format" + }, + "time": { + "param_type": "string", + "description": "Event Time (HH:MM)" + }, + "location": { + "param_type": "string", + "description": "Location of the event" + }, + "participants": { + "param_type": "list", + "description": "List of participant names" + } + } + }, + { + "tool_name": "get_event", + "description": "Get an event by date and time", + "parameters": { + "date": { + "param_type": "string", + "description": "Date of the event in ISO format" + }, + "time": { + "param_type": "string", + "description": "Event Time (HH:MM)" + } + } + } + ], + "tool_responses": [ + { + "response": "{'response': 'No events found for 2025-03-03 at 10:00'}" + }, + { + "response": "{'response': 'Successfully created new event with id: e_123'}" + } + ], + "expected": [ + { + "num_tool_calls": 1, + "tool_name": "get_event", + "tool_arguments": { + "date": "2025-03-03", + "time": "10:00" + } + }, + { + "num_tool_calls": 0, + "answer": "no" + }, + { + "num_tool_calls": 1, + "tool_name": "create_event", + "tool_arguments": { + "name": "Team Building", + "date": "2025-03-03", + "time": "10:00", + "location": "Main Conference Room", + "participants": [ + "Alice", + "Bob", + "Charlie" + ] + } + }, + { + "num_tool_calls": 0, + "answer": "e_123" + } + ] + } + }, + "multi_turn_tool_calling_05": { + "data": { + "messages": [ + [ + { + "role": "system", + "content": "Todays date is 2025-03-01." 
+ }, + { + "role": "user", + "content": "what was my monthly expense in Jan of this year?" + } + ], + [ + { + "role": "user", + "content": "Was it less than Feb of last year? Only answer with yes or no." + } + ] + ], + "tools": [ + { + "tool_name": "getMonthlyExpenseSummary", + "description": "Get monthly expense summary", + "parameters": { + "month": { + "param_type": "int", + "description": "Month of the year (1-12)" + }, + "year": { + "param_type": "int", + "description": "Year" + } + } + } + ], + "tool_responses": [ + { + "response": "{'response': 'Total expenses for January 2025: $1000'}" + }, + { + "response": "{'response': 'Total expenses for February 2024: $2000'}" + } + ], + "expected": [ + { + "num_tool_calls": 1, + "tool_name": "getMonthlyExpenseSummary", + "tool_arguments": { + "month": 1, + "year": 2025 + } + }, + { + "num_tool_calls": 0, + "answer": "1000" + }, + { + "num_tool_calls": 1, + "tool_name": "getMonthlyExpenseSummary", + "tool_arguments": { + "month": 2, + "year": 2024 + } + }, + { + "num_tool_calls": 0, + "answer": "yes" + } + ] + } + }, "sample_messages_tool_calling": { "data": { "messages": [ @@ -94,9 +451,9 @@ "description": "Get the current weather", "parameters": { "location": { - "param_type": "string", - "description": "The city and state, e.g. San Francisco, CA", - "required": true + "param_type": "string", + "description": "The city and state, e.g. 
San Francisco, CA", + "required": true } } } @@ -167,14 +524,14 @@ "description": "Get the list of objects in a namespace", "parameters": { "kind": { - "param_type": "string", - "description": "the type of object", - "required": true + "param_type": "string", + "description": "the type of object", + "required": true }, "namespace": { - "param_type": "string", - "description": "the name of the namespace", - "required": true + "param_type": "string", + "description": "the name of the namespace", + "required": true } } } From c1973f6528e81be63c976d79cf947ae9cc068a63 Mon Sep 17 00:00:00 2001 From: Francisco Arceo Date: Mon, 7 Apr 2025 03:58:33 -0600 Subject: [PATCH 04/24] docs: Fix typo in README.md (#1880) # What does this PR do? Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5442fe5d2..0a741f916 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ ### ✨🎉 Llama 4 Support 🎉✨ -We release [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta. +We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta. You can now run Llama 4 models on Llama Stack. 
From c52ccc4bbdb9844489264e9d52b07c205692e6f2 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Mon, 7 Apr 2025 06:31:04 -0400 Subject: [PATCH 05/24] docs: update importing_as_library.md (#1863) LlamaStackAsLibraryClient.initialize is not async, cannot be await'd --- docs/source/distributions/importing_as_library.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/distributions/importing_as_library.md b/docs/source/distributions/importing_as_library.md index 29a5669b3..967a18b54 100644 --- a/docs/source/distributions/importing_as_library.md +++ b/docs/source/distributions/importing_as_library.md @@ -17,7 +17,7 @@ client = LlamaStackAsLibraryClient( # provider_data is optional, but if you need to pass in any provider specific data, you can do so here. provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]}, ) -await client.initialize() +client.initialize() ``` This will parse your config and set up any inline implementations and remote clients needed for your implementation. From 530d4bdfe130ace4b31f09cb0334195928d4bc08 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Mon, 7 Apr 2025 15:03:58 -0700 Subject: [PATCH 06/24] refactor: move all llama code to models/llama out of meta reference (#1887) # What does this PR do? Move around bits. This makes the copies from llama-models _much_ easier to maintain and ensures we don't entangle meta-reference specific tidbits into llama-models code even by accident. Also, kills the meta-reference-quantized-gpu distro and rolls quantization deps into meta-reference-gpu. ## Test Plan ``` LLAMA_MODELS_DEBUG=1 \ with-proxy llama stack run meta-reference-gpu \ --env INFERENCE_MODEL=meta-llama/Llama-4-Scout-17B-16E-Instruct \ --env INFERENCE_CHECKPOINT_DIR= \ --env MODEL_PARALLEL_SIZE=4 \ --env QUANTIZATION_TYPE=fp8_mixed ``` Start a server with and without quantization. 
Point integration tests to it using: ``` pytest -s -v tests/integration/inference/test_text_inference.py \ --stack-config http://localhost:8321 --text-model meta-llama/Llama-4-Scout-17B-16E-Instruct ``` --- llama_stack/apis/inference/inference.py | 63 ++- llama_stack/cli/download.py | 2 +- llama_stack/cli/model/describe.py | 11 - llama_stack/cli/model/prompt_format.py | 2 +- llama_stack/cli/model/safety_models.py | 5 +- llama_stack/models/llama/checkpoint.py | 164 ++++++++ llama_stack/models/llama/datatypes.py | 339 ++-------------- .../llama}/hadamard_utils.py | 0 .../llama}/llama3/args.py | 7 - .../models/llama/llama3/chat_format.py | 10 +- llama_stack/models/llama/llama3/generation.py | 367 ++++++++++++++++++ llama_stack/models/llama/llama3/interface.py | 3 +- .../llama}/llama3/model.py | 20 +- .../llama}/llama3/multimodal/__init__.py | 0 .../llama}/llama3/multimodal/encoder_utils.py | 0 .../llama3/multimodal/image_transform.py | 0 .../llama}/llama3/multimodal/model.py | 73 ++-- .../llama}/llama3/multimodal/utils.py | 0 .../llama3/prompt_templates/system_prompts.py | 2 +- .../llama/llama3/quantization}/__init__.py | 2 - .../llama}/llama3/quantization/loader.py | 73 ++-- .../models/llama/llama3/template_data.py | 3 +- llama_stack/models/llama/llama3/tokenizer.py | 10 - llama_stack/models/llama/llama3/tool_utils.py | 3 +- llama_stack/models/llama/llama3_2/__init__.py | 7 - .../models/llama/llama3_2/prompts_text.py | 6 - .../models/llama/llama3_2/prompts_vision.py | 7 - .../llama}/llama4/args.py | 7 - .../models/llama/llama4/chat_format.py | 20 +- .../llama}/llama4/datatypes.py | 7 - .../llama}/llama4/ffn.py | 0 .../llama}/llama4/generation.py | 153 ++++---- .../llama}/llama4/model.py | 11 - .../llama}/llama4/moe.py | 16 +- .../llama}/llama4/preprocess.py | 0 llama_stack/models/llama/llama4/prompts.py | 11 +- .../llama/llama4/quantization/__init__.py | 5 + .../llama}/llama4/quantization/loader.py | 98 +++-- llama_stack/models/llama/llama4/tokenizer.py | 25 +- 
.../llama}/llama4/vision/embedding.py | 7 - .../llama}/llama4/vision/encoder.py | 0 llama_stack/models/llama/prompt_format.py | 58 ++- .../llama}/quantize_impls.py | 0 llama_stack/models/llama/sku_list.py | 55 +-- llama_stack/models/llama/sku_types.py | 229 +++++++++++ .../agents/meta_reference/agent_instance.py | 2 +- .../inference/meta_reference/__init__.py | 6 +- .../inline/inference/meta_reference/common.py | 9 - .../inline/inference/meta_reference/config.py | 26 +- .../inference/meta_reference/generators.py | 93 +++-- .../inference/meta_reference/inference.py | 12 +- .../meta_reference/llama3/generation.py | 346 ----------------- .../meta_reference/parallel_utils.py | 5 +- .../inline/inference/vllm/openai_utils.py | 3 +- .../providers/inline/inference/vllm/vllm.py | 4 +- .../post_training/torchtune/common/utils.py | 2 +- .../inline/safety/llama_guard/llama_guard.py | 3 +- llama_stack/providers/registry/inference.py | 9 +- .../remote/inference/bedrock/models.py | 2 +- .../remote/inference/cerebras/cerebras.py | 2 +- .../remote/inference/cerebras/models.py | 2 +- .../remote/inference/databricks/databricks.py | 2 +- .../remote/inference/fireworks/models.py | 2 +- .../remote/inference/nvidia/models.py | 2 +- .../remote/inference/nvidia/nvidia.py | 6 +- .../remote/inference/nvidia/openai_utils.py | 4 +- .../remote/inference/ollama/models.py | 2 +- .../remote/inference/sambanova/models.py | 2 +- .../remote/inference/sambanova/sambanova.py | 6 +- .../remote/inference/together/models.py | 2 +- .../remote/post_training/nvidia/models.py | 2 +- llama_stack/providers/tests/report.py | 2 +- .../providers/utils/inference/__init__.py | 2 +- .../utils/inference/openai_compat.py | 8 +- .../utils/inference/prompt_adapter.py | 3 +- llama_stack/templates/dependencies.json | 45 +-- .../meta-reference-gpu/run-with-safety.yaml | 6 + .../templates/meta-reference-gpu/run.yaml | 3 + .../meta-reference-quantized-gpu/build.yaml | 32 -- .../doc_template.md | 113 ------ 
.../meta_reference.py | 115 ------ .../meta-reference-quantized-gpu/run.yaml | 134 ------- pyproject.toml | 6 +- scripts/generate_prompt_format.py | 42 +- tests/integration/report.py | 2 +- 85 files changed, 1267 insertions(+), 1683 deletions(-) create mode 100644 llama_stack/models/llama/checkpoint.py rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/hadamard_utils.py (100%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama3/args.py (88%) create mode 100644 llama_stack/models/llama/llama3/generation.py rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama3/model.py (94%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama3/multimodal/__init__.py (100%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama3/multimodal/encoder_utils.py (100%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama3/multimodal/image_transform.py (100%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama3/multimodal/model.py (95%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama3/multimodal/utils.py (100%) rename llama_stack/{templates/meta-reference-quantized-gpu => models/llama/llama3/quantization}/__init__.py (74%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama3/quantization/loader.py (84%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama4/args.py (91%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama4/datatypes.py (85%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama4/ffn.py (100%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama4/generation.py (72%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama4/model.py (97%) 
rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama4/moe.py (87%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama4/preprocess.py (100%) create mode 100644 llama_stack/models/llama/llama4/quantization/__init__.py rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama4/quantization/loader.py (70%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama4/vision/embedding.py (96%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/llama4/vision/encoder.py (100%) rename llama_stack/{providers/inline/inference/meta_reference => models/llama}/quantize_impls.py (100%) create mode 100644 llama_stack/models/llama/sku_types.py delete mode 100644 llama_stack/providers/inline/inference/meta_reference/llama3/generation.py delete mode 100644 llama_stack/templates/meta-reference-quantized-gpu/build.yaml delete mode 100644 llama_stack/templates/meta-reference-quantized-gpu/doc_template.md delete mode 100644 llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py delete mode 100644 llama_stack/templates/meta-reference-quantized-gpu/run.yaml diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py index 1d4012c19..e59132e33 100644 --- a/llama_stack/apis/inference/inference.py +++ b/llama_stack/apis/inference/inference.py @@ -25,15 +25,64 @@ from llama_stack.apis.models import Model from llama_stack.apis.telemetry.telemetry import MetricResponseMixin from llama_stack.models.llama.datatypes import ( BuiltinTool, - SamplingParams, StopReason, ToolCall, ToolDefinition, + ToolParamDefinition, ToolPromptFormat, ) from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_stack.schema_utils import json_schema_type, register_schema, webmethod +register_schema(ToolCall) +register_schema(ToolParamDefinition) +register_schema(ToolDefinition) + + 
+@json_schema_type +class GreedySamplingStrategy(BaseModel): + type: Literal["greedy"] = "greedy" + + +@json_schema_type +class TopPSamplingStrategy(BaseModel): + type: Literal["top_p"] = "top_p" + temperature: Optional[float] = Field(..., gt=0.0) + top_p: Optional[float] = 0.95 + + +@json_schema_type +class TopKSamplingStrategy(BaseModel): + type: Literal["top_k"] = "top_k" + top_k: int = Field(..., ge=1) + + +SamplingStrategy = Annotated[ + Union[GreedySamplingStrategy, TopPSamplingStrategy, TopKSamplingStrategy], + Field(discriminator="type"), +] +register_schema(SamplingStrategy, name="SamplingStrategy") + + +@json_schema_type +class SamplingParams(BaseModel): + """Sampling parameters. + + :param strategy: The sampling strategy. + :param max_tokens: The maximum number of tokens that can be generated in the completion. The token count of + your prompt plus max_tokens cannot exceed the model's context length. + :param repetition_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens + based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. + :param stop: Up to 4 sequences where the API will stop generating further tokens. + The returned text will not contain the stop sequence. + """ + + strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy) + + max_tokens: Optional[int] = 0 + repetition_penalty: Optional[float] = 1.0 + stop: Optional[List[str]] = None + class LogProbConfig(BaseModel): """ @@ -48,18 +97,18 @@ class QuantizationType(Enum): """Type of model quantization to run inference with. 
:cvar bf16: BFloat16 typically this means _no_ quantization - :cvar fp8: 8-bit floating point quantization - :cvar int4: 4-bit integer quantization + :cvar fp8_mixed: 8-bit floating point quantization with mixed precision + :cvar int4_mixed: 4-bit integer quantization with mixed precision """ bf16 = "bf16" - fp8 = "fp8" - int4 = "int4" + fp8_mixed = "fp8_mixed" + int4_mixed = "int4_mixed" @json_schema_type class Fp8QuantizationConfig(BaseModel): - type: Literal["fp8"] = "fp8" + type: Literal["fp8_mixed"] = "fp8_mixed" @json_schema_type @@ -75,7 +124,7 @@ class Int4QuantizationConfig(BaseModel): :param scheme: Quantization scheme to use. Defaults to "int4_weight_int8_dynamic_activation" """ - type: Literal["int4"] = "int4" + type: Literal["int4_mixed"] = "int4_mixed" scheme: Optional[str] = "int4_weight_int8_dynamic_activation" diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py index fc3e7008f..9694bf22d 100644 --- a/llama_stack/cli/download.py +++ b/llama_stack/cli/download.py @@ -29,8 +29,8 @@ from rich.progress import ( from termcolor import cprint from llama_stack.cli.subcommand import Subcommand -from llama_stack.models.llama.datatypes import Model from llama_stack.models.llama.sku_list import LlamaDownloadInfo +from llama_stack.models.llama.sku_types import Model class Download(Subcommand): diff --git a/llama_stack/cli/model/describe.py b/llama_stack/cli/model/describe.py index f347bdf8d..62dde36e8 100644 --- a/llama_stack/cli/model/describe.py +++ b/llama_stack/cli/model/describe.py @@ -63,17 +63,6 @@ class ModelDescribe(Subcommand): ("Model params.json", json.dumps(model.arch_args, indent=4)), ] - if model.recommended_sampling_params is not None: - sampling_params = model.recommended_sampling_params.model_dump() - for k in ("max_tokens", "repetition_penalty"): - del sampling_params[k] - rows.append( - ( - "Recommended sampling params", - json.dumps(sampling_params, indent=4), - ) - ) - print_table( rows, headers, diff --git 
a/llama_stack/cli/model/prompt_format.py b/llama_stack/cli/model/prompt_format.py index 3ce77655b..673487812 100644 --- a/llama_stack/cli/model/prompt_format.py +++ b/llama_stack/cli/model/prompt_format.py @@ -11,7 +11,7 @@ from pathlib import Path from llama_stack.cli.subcommand import Subcommand from llama_stack.cli.table import print_table -from llama_stack.models.llama.datatypes import CoreModelId, ModelFamily, is_multimodal, model_family +from llama_stack.models.llama.sku_types import CoreModelId, ModelFamily, is_multimodal, model_family ROOT_DIR = Path(__file__).parent.parent.parent diff --git a/llama_stack/cli/model/safety_models.py b/llama_stack/cli/model/safety_models.py index c81783f60..131d055aa 100644 --- a/llama_stack/cli/model/safety_models.py +++ b/llama_stack/cli/model/safety_models.py @@ -4,12 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Any, Dict, Optional +from typing import Any, Dict from pydantic import BaseModel, ConfigDict, Field -from llama_stack.models.llama.datatypes import CheckpointQuantizationFormat, SamplingParams from llama_stack.models.llama.sku_list import LlamaDownloadInfo +from llama_stack.models.llama.sku_types import CheckpointQuantizationFormat class PromptGuardModel(BaseModel): @@ -23,7 +23,6 @@ class PromptGuardModel(BaseModel): is_instruct_model: bool = False quantization_format: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16 arch_args: Dict[str, Any] = Field(default_factory=dict) - recommended_sampling_params: Optional[SamplingParams] = None def descriptor(self) -> str: return self.model_id diff --git a/llama_stack/models/llama/checkpoint.py b/llama_stack/models/llama/checkpoint.py new file mode 100644 index 000000000..2bae08a69 --- /dev/null +++ b/llama_stack/models/llama/checkpoint.py @@ -0,0 +1,164 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import concurrent.futures +import re +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import torch +from fairscale.nn.model_parallel.initialize import get_model_parallel_rank, get_model_parallel_world_size + + +def map_mp_rank(old_mp_size: int, new_mp_size: int, new_mp_rank: int) -> List[int]: + """Map a new MP rank to a list of old MP ranks given a change in MP size.""" + if new_mp_size % old_mp_size == 0: + # Read old MP shard and split it into smaller ones + return [new_mp_rank * old_mp_size // new_mp_size] + elif old_mp_size % new_mp_size == 0: + # Merge old MP shards into a single one + mp_factor = old_mp_size // new_mp_size + return list(range(new_mp_rank * mp_factor, (new_mp_rank + 1) * mp_factor)) + else: + raise ValueError( + f"Either old MP size or new MP size should be a multiple of the other: " + f"{old_mp_size} % {new_mp_size} != 0 and {new_mp_size} % {old_mp_size} != 0" + ) + + +def maybe_reshard_state_dict( + ckpt_paths: List[Path], + n_kv_heads: int, + moe_num_experts: Optional[int] = None, + map_location: Union[str, torch.device] = "cpu", + mmap: bool = True, +) -> Dict[str, torch.Tensor]: + if str(map_location) == "cpu": + torch.set_default_tensor_type(torch.BFloat16Tensor) + else: + torch.set_default_tensor_type(torch.cuda.BFloat16Tensor) + + ckpt_paths = np.array(sorted(ckpt_paths)) + + new_mp_size, new_mp_rank = get_model_parallel_world_size(), get_model_parallel_rank() + old_mp_size = len(ckpt_paths) + old_mp_ranks = map_mp_rank(old_mp_size, new_mp_size, new_mp_rank) + + print(f"Loading checkpoint shards:\n{str(ckpt_paths[old_mp_ranks])}") # type: ignore + paths = ckpt_paths[old_mp_ranks] # type: ignore + state_dicts = [torch.load(str(p), map_location=map_location, mmap=mmap) for p in paths] + + if new_mp_size == old_mp_size: + return state_dicts[0] # type: ignore 
+ + if moe_num_experts is not None: + state_dicts = [convert_moe_weights(d, moe_num_experts) for d in state_dicts] + + print(f"Resharding {len(state_dicts)} state dicts from MP size {old_mp_size} to MP size {new_mp_size}") + return reshard_mp( + state_dicts, + size=max(new_mp_size // old_mp_size, 1), + rank=new_mp_rank % max(new_mp_size // old_mp_size, 1), + repeat_qk_qv=max(new_mp_size // n_kv_heads, 1), + ) + + +_WEIGHT_ROW_KEY = { + "feed_forward.w2", + "feed_forward.mlp.fc2", + "attention.wo", + "feed_forward.mlp.fc2_weight", + "feed_forward.w_out_shared_DF.weight", + "attn.wo.weight", + "mlp.c_proj.weight", +} +_MOE_WEIGHT_ROW_KEY = {"feed_forward.experts.(moe_w_in_eD_F|moe_w_swiglu_eD_F)"} + +_WEIGHT_COLUMN_KEY = { + "output", + "feed_forward.(w1|w3)", + "feed_forward.mlp.(fc1|fc3)", + "feed_forward.mlp.fc1_weight", + "attention.(wk|wq|wv|wqkv).weight", + "feed_forward.(w_in_shared_FD|w_swiglu_FD)", + "attn.(wk|wq|wv).weight", + "attn.(wk|wq|wv).bias", + "mlp.c_fc.weight", + "mlp.c_fc.bias", + "conv1._linear.weight", + "tok_embeddings.weight", + "vision_projection.weight", +} +_MOE_WEIGHT_COLUMN_KEY = {"feed_forward.experts.moe_w_out_eF_D"} + + +def reshard_mp( + state_dicts: List[Dict[str, torch.Tensor]], + size: int, + rank: int, + repeat_qk_qv: int = 1, +) -> Dict[str, torch.Tensor]: + """ + Reshard a list of state dicts into a single state dict given a change in MP size. + If the list has more than one state dict, we concatenate the values of the same + key across all state dicts. Otherwise, we just slice it for the current MP rank. 
+ """ + + def concat_or_chunk(tensors: List[torch.Tensor], dim: int) -> torch.Tensor: + if len(tensors) > 1: + return torch.cat(tensors, dim=dim) + return tensors[0].chunk(size, dim=dim)[rank].clone() + + def process_key(key: str) -> torch.Tensor: + if row_regex.search(key): + return concat_or_chunk([s[key] for s in state_dicts], dim=-1) + elif column_regex.search(key): + if "w13" in key or "fc1_weight" in key: + dims = state_dicts[0][key].size() + values = [s[key].view(2, dims[0] // 2, *dims[1:]) for s in state_dicts] + return concat_or_chunk(values, dim=1).flatten(0, 1) + elif "qkv" in key: + q_dim = state_dicts[0][key.replace("qkv", "o")].size(1) + kv_dim = (state_dicts[0][key].size(0) - q_dim) // 2 + values = [s[key].split((q_dim, kv_dim, kv_dim)) for s in state_dicts] + return torch.cat([concat_or_chunk(x, dim=0) for x in zip(*values, strict=False)]) # type: ignore + elif "wk.weight" in key or "wv.weight" in key: + # Support MP > #kv_head + return concat_or_chunk([s[key].repeat(repeat_qk_qv, 1) for s in state_dicts], dim=0) + elif key == "output.bias" or key == "fc.weight": + return concat_or_chunk([s[key] for s in state_dicts], dim=0) + elif "w_" in key: + return concat_or_chunk([s[key] for s in state_dicts], dim=-2) + else: + return concat_or_chunk([s[key] for s in state_dicts], dim=0) + else: + return state_dicts[0][key].clone() + + row_keys = _WEIGHT_ROW_KEY | _MOE_WEIGHT_ROW_KEY + column_keys = _WEIGHT_COLUMN_KEY | _MOE_WEIGHT_COLUMN_KEY + + column_regex = re.compile("|".join(column_keys)) + row_regex = re.compile("|".join(row_keys)) + + output: Dict[str, torch.Tensor] = {} + with concurrent.futures.ThreadPoolExecutor() as executor: + # Note: only processes keys in the first state dict. + # Assumes keys are the same across all state dicts. 
+ mappings = {executor.submit(process_key, key): key for key in state_dicts[0]} + for future in concurrent.futures.as_completed(mappings): + output[mappings[future]] = future.result() + return output + + +def convert_moe_weights(state_dict: Dict[str, Any], num_experts: int) -> Dict[str, Any]: + routed_keys = _MOE_WEIGHT_ROW_KEY | _MOE_WEIGHT_COLUMN_KEY + routed_regex = re.compile("|".join(routed_keys)) + keys = list(state_dict.keys()) + for key in keys: + if routed_regex.search(key): + state_dict[key] = state_dict.pop(key).unflatten(0, (num_experts, -1)).squeeze(dim=0) + return state_dict diff --git a/llama_stack/models/llama/datatypes.py b/llama_stack/models/llama/datatypes.py index ef791da8f..48cb51005 100644 --- a/llama_stack/models/llama/datatypes.py +++ b/llama_stack/models/llama/datatypes.py @@ -4,13 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. - import base64 from enum import Enum from io import BytesIO @@ -19,8 +12,6 @@ from typing import Any, Dict, List, Literal, Optional, Union from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator from typing_extensions import Annotated -from llama_stack.schema_utils import json_schema_type, register_schema - # The goal is that these set of types are relevant for all Llama models. # That isn't the current state yet -- e.g., BuiltinTool is somewhat specific to # the llama3 series of models. 
@@ -98,6 +89,29 @@ class StopReason(Enum): out_of_tokens = "out_of_tokens" +class ToolParamDefinition(BaseModel): + param_type: str + description: Optional[str] = None + required: Optional[bool] = True + default: Optional[Any] = None + + +class ToolDefinition(BaseModel): + tool_name: Union[BuiltinTool, str] + description: Optional[str] = None + parameters: Optional[Dict[str, ToolParamDefinition]] = None + + @field_validator("tool_name", mode="before") + @classmethod + def validate_field(cls, v): + if isinstance(v, str): + try: + return BuiltinTool(v) + except ValueError: + return v + return v + + class RawMediaItem(BaseModel): type: Literal["image"] = "image" data: bytes | BytesIO @@ -140,292 +154,25 @@ class RawMessage(BaseModel): tool_calls: List[ToolCall] = Field(default_factory=list) -register_schema(ToolCall) +class GenerationResult(BaseModel): + token: int + text: str + logprobs: Optional[List[float]] = None + + source: Literal["input"] | Literal["output"] + + # index within the batch + batch_idx: int + # whether generation for this item is already finished. note that tokens can + # get returned even afterwards since other items in the batch can still be generating tokens + finished: bool + # because a batch is parallel processed, useful decoding for one item can correspond to processing + # pad tokens or tokens beyond EOS for other items. 
we could have decided to return None for this case + # but it's more convenient to return a list of GenerationResult and filter out the ignored tokens + ignore_token: bool -@json_schema_type -class ToolParamDefinition(BaseModel): - param_type: str - description: Optional[str] = None - required: Optional[bool] = True - default: Optional[Any] = None - - -@json_schema_type -class ToolDefinition(BaseModel): - tool_name: Union[BuiltinTool, str] - description: Optional[str] = None - parameters: Optional[Dict[str, ToolParamDefinition]] = None - - @field_validator("tool_name", mode="before") - @classmethod - def validate_field(cls, v): - if isinstance(v, str): - try: - return BuiltinTool(v) - except ValueError: - return v - return v - - -@json_schema_type -class GreedySamplingStrategy(BaseModel): - type: Literal["greedy"] = "greedy" - - -@json_schema_type -class TopPSamplingStrategy(BaseModel): - type: Literal["top_p"] = "top_p" - temperature: Optional[float] = Field(..., gt=0.0) - top_p: Optional[float] = 0.95 - - -@json_schema_type -class TopKSamplingStrategy(BaseModel): - type: Literal["top_k"] = "top_k" - top_k: int = Field(..., ge=1) - - -SamplingStrategy = Annotated[ - Union[GreedySamplingStrategy, TopPSamplingStrategy, TopKSamplingStrategy], - Field(discriminator="type"), -] -register_schema(SamplingStrategy, name="SamplingStrategy") - - -@json_schema_type -class SamplingParams(BaseModel): - """Sampling parameters. - - :param strategy: The sampling strategy. - :param max_tokens: The maximum number of tokens that can be generated in the completion. The token count of - your prompt plus max_tokens cannot exceed the model's context length. - :param repetition_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens - based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. - :param stop: Up to 4 sequences where the API will stop generating further tokens. 
- The returned text will not contain the stop sequence. - """ - - strategy: SamplingStrategy = Field(default_factory=GreedySamplingStrategy) - - max_tokens: Optional[int] = 0 - repetition_penalty: Optional[float] = 1.0 - stop: Optional[List[str]] = None - - -class CheckpointQuantizationFormat(Enum): - # default format - bf16 = "bf16" - - # used for enabling fp8_rowwise inference, some weights are bf16 - fp8_mixed = "fp8-mixed" - - int8 = "int8" - - int4 = "int4" - - -class ModelFamily(Enum): - llama2 = "llama2" - llama3 = "llama3" - llama3_1 = "llama3_1" - llama3_2 = "llama3_2" - llama3_3 = "llama3_3" - llama4 = "llama4" - safety = "safety" - - -class CoreModelId(Enum): - """Each of these models is a unique "SKU". These root models can be served in various garbs (especially by quantizing them)""" - - # Llama 2 family - llama2_7b = "Llama-2-7b" - llama2_13b = "Llama-2-13b" - llama2_70b = "Llama-2-70b" - llama2_7b_chat = "Llama-2-7b-chat" - llama2_13b_chat = "Llama-2-13b-chat" - llama2_70b_chat = "Llama-2-70b-chat" - - # Llama 3 family - llama3_8b = "Llama-3-8B" - llama3_70b = "Llama-3-70B" - llama3_8b_instruct = "Llama-3-8B-Instruct" - llama3_70b_instruct = "Llama-3-70B-Instruct" - - # Llama 3.1 family - llama3_1_8b = "Llama3.1-8B" - llama3_1_70b = "Llama3.1-70B" - llama3_1_405b = "Llama3.1-405B" - llama3_1_8b_instruct = "Llama3.1-8B-Instruct" - llama3_1_70b_instruct = "Llama3.1-70B-Instruct" - llama3_1_405b_instruct = "Llama3.1-405B-Instruct" - - # Llama 3.2 family - llama3_2_1b = "Llama3.2-1B" - llama3_2_3b = "Llama3.2-3B" - llama3_2_1b_instruct = "Llama3.2-1B-Instruct" - llama3_2_3b_instruct = "Llama3.2-3B-Instruct" - llama3_2_11b_vision = "Llama3.2-11B-Vision" - llama3_2_90b_vision = "Llama3.2-90B-Vision" - llama3_2_11b_vision_instruct = "Llama3.2-11B-Vision-Instruct" - llama3_2_90b_vision_instruct = "Llama3.2-90B-Vision-Instruct" - - # Llama 3.3 family - llama3_3_70b_instruct = "Llama3.3-70B-Instruct" - - # Llama 4 family - llama4_scout_17b_16e = 
"Llama-4-Scout-17B-16E" - llama4_scout_17b_16e_instruct = "Llama-4-Scout-17B-16E-Instruct" - llama4_maverick_17b_128e = "Llama-4-Maverick-17B-128E" - llama4_maverick_17b_128e_instruct = "Llama-4-Maverick-17B-128E-Instruct" - - # Safety models - llama_guard_3_8b = "Llama-Guard-3-8B" - llama_guard_2_8b = "Llama-Guard-2-8B" - llama_guard_3_11b_vision = "Llama-Guard-3-11B-Vision" - llama_guard_3_1b = "Llama-Guard-3-1B" - - -def is_multimodal(model_id) -> bool: - if model_id in [ - CoreModelId.llama3_2_11b_vision, - CoreModelId.llama3_2_90b_vision, - CoreModelId.llama3_2_11b_vision_instruct, - CoreModelId.llama3_2_90b_vision_instruct, - ]: - return True - else: - return False - - -def model_family(model_id) -> ModelFamily: - if model_id in [ - CoreModelId.llama2_7b, - CoreModelId.llama2_13b, - CoreModelId.llama2_70b, - CoreModelId.llama2_7b_chat, - CoreModelId.llama2_13b_chat, - CoreModelId.llama2_70b_chat, - ]: - return ModelFamily.llama2 - elif model_id in [ - CoreModelId.llama3_8b, - CoreModelId.llama3_70b, - CoreModelId.llama3_8b_instruct, - CoreModelId.llama3_70b_instruct, - ]: - return ModelFamily.llama3 - elif model_id in [ - CoreModelId.llama3_1_8b, - CoreModelId.llama3_1_70b, - CoreModelId.llama3_1_405b, - CoreModelId.llama3_1_8b_instruct, - CoreModelId.llama3_1_70b_instruct, - CoreModelId.llama3_1_405b_instruct, - ]: - return ModelFamily.llama3_1 - elif model_id in [ - CoreModelId.llama3_2_1b, - CoreModelId.llama3_2_3b, - CoreModelId.llama3_2_1b_instruct, - CoreModelId.llama3_2_3b_instruct, - CoreModelId.llama3_2_11b_vision, - CoreModelId.llama3_2_90b_vision, - CoreModelId.llama3_2_11b_vision_instruct, - CoreModelId.llama3_2_90b_vision_instruct, - ]: - return ModelFamily.llama3_2 - elif model_id in [ - CoreModelId.llama3_3_70b_instruct, - ]: - return ModelFamily.llama3_3 - elif model_id in [ - CoreModelId.llama4_scout_17b_16e, - CoreModelId.llama4_scout_17b_16e_instruct, - CoreModelId.llama4_maverick_17b_128e, - CoreModelId.llama4_maverick_17b_128e_instruct, - 
]: - return ModelFamily.llama4 - elif model_id in [ - CoreModelId.llama_guard_3_8b, - CoreModelId.llama_guard_2_8b, - CoreModelId.llama_guard_3_11b_vision, - CoreModelId.llama_guard_3_1b, - ]: - return ModelFamily.safety - else: - raise ValueError(f"Unknown model family for {model_id}") - - -class Model(BaseModel): - core_model_id: CoreModelId - description: str - huggingface_repo: Optional[str] = None - recommended_sampling_params: Optional[SamplingParams] = None - arch_args: Dict[str, Any] - variant: str = "" - - quantization_format: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16 - pth_file_count: int - metadata: Optional[Dict[str, Any]] = Field(default_factory=dict) - - # silence pydantic until we remove the `model_` fields - model_config = ConfigDict(protected_namespaces=()) - - @property - def model_family(self) -> ModelFamily: - return model_family(self.core_model_id) - - # The SKU is uniquely identified by (model_id, variant) combo - def descriptor(self, shorten_default_variant: bool = True) -> str: - if not self.variant: - return self.core_model_id.value - return f"{self.core_model_id.value}:{self.variant}" - - @property - def is_instruct_model(self) -> bool: - return "instruct" in self.id.name - - # Featured models are shown in the non-exhaustive model list - @property - def is_featured(self) -> bool: - return self.model_family in [ - ModelFamily.llama3_1, - ModelFamily.llama3_2, - ModelFamily.llama3_3, - ModelFamily.llama4, - ModelFamily.safety, - ] - - @property - def max_seq_length(self) -> int: - if self.model_family == ModelFamily.llama2: - return 4096 - elif self.core_model_id == CoreModelId.llama_guard_2_8b: - return 4096 - elif self.model_family == ModelFamily.llama3: - return 8192 - elif self.model_family in [ModelFamily.llama3_1, ModelFamily.llama3_3]: - return 131072 - elif self.model_family == ModelFamily.llama3_2: - if self.quantization_format == CheckpointQuantizationFormat.int4: - return 8192 - return 131072 - elif 
self.model_family == ModelFamily.llama4: - if self.core_model_id in { - CoreModelId.llama4_scout_17b_16e, - CoreModelId.llama4_maverick_17b_128e, - }: - return 262144 - if self.core_model_id == CoreModelId.llama4_scout_17b_16e_instruct: - return 10485760 - if self.core_model_id == CoreModelId.llama4_maverick_17b_128e_instruct: - return 1048576 - elif self.core_model_id in [ - CoreModelId.llama_guard_3_8b, - CoreModelId.llama_guard_3_11b_vision, - CoreModelId.llama_guard_3_1b, - ]: - return 131072 - else: - raise ValueError(f"Unknown max_seq_len for {self.core_model_id}") +class QuantizationMode(str, Enum): + none = "none" + fp8_mixed = "fp8_mixed" + int4_mixed = "int4_mixed" diff --git a/llama_stack/providers/inline/inference/meta_reference/hadamard_utils.py b/llama_stack/models/llama/hadamard_utils.py similarity index 100% rename from llama_stack/providers/inline/inference/meta_reference/hadamard_utils.py rename to llama_stack/models/llama/hadamard_utils.py diff --git a/llama_stack/providers/inline/inference/meta_reference/llama3/args.py b/llama_stack/models/llama/llama3/args.py similarity index 88% rename from llama_stack/providers/inline/inference/meta_reference/llama3/args.py rename to llama_stack/models/llama/llama3/args.py index e96eaca61..f7e4b4557 100644 --- a/llama_stack/providers/inline/inference/meta_reference/llama3/args.py +++ b/llama_stack/models/llama/llama3/args.py @@ -4,13 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. 
- from dataclasses import dataclass from enum import Enum from typing import Optional diff --git a/llama_stack/models/llama/llama3/chat_format.py b/llama_stack/models/llama/llama3/chat_format.py index 2862f8558..f55cd5e1c 100644 --- a/llama_stack/models/llama/llama3/chat_format.py +++ b/llama_stack/models/llama/llama3/chat_format.py @@ -4,13 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. - import io import json import uuid @@ -19,7 +12,7 @@ from typing import Dict, List, Optional, Tuple from PIL import Image as PIL_Image -from llama_stack.models.llama.datatypes import ( +from ..datatypes import ( BuiltinTool, RawContent, RawMediaItem, @@ -30,7 +23,6 @@ from llama_stack.models.llama.datatypes import ( ToolCall, ToolPromptFormat, ) - from .tokenizer import Tokenizer from .tool_utils import ToolUtils diff --git a/llama_stack/models/llama/llama3/generation.py b/llama_stack/models/llama/llama3/generation.py new file mode 100644 index 000000000..ee99a07ba --- /dev/null +++ b/llama_stack/models/llama/llama3/generation.py @@ -0,0 +1,367 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# top-level folder for each specific model found within the models/ directory at +# the top-level of this source tree. 
+ +import json +import os +import sys +import time +from pathlib import Path +from typing import Callable, Generator, List, Optional + +import torch +import torch.nn.functional as F +from fairscale.nn.model_parallel.initialize import ( + initialize_model_parallel, + model_parallel_is_initialized, +) +from termcolor import cprint + +from ..checkpoint import maybe_reshard_state_dict +from ..datatypes import GenerationResult, QuantizationMode, RawContent, RawMessage, ToolPromptFormat +from .args import ModelArgs +from .chat_format import ChatFormat, LLMInput +from .model import Transformer +from .multimodal.model import CrossAttentionTransformer +from .tokenizer import Tokenizer + + +class Llama3: + @staticmethod + def build( + ckpt_dir: str, + max_seq_len: int, + max_batch_size: int, + world_size: Optional[int] = None, + quantization_mode: Optional[QuantizationMode] = None, + seed: int = 1, + device: str = "cuda", + ): + device = torch.device(device) + if ( + device.type == "cuda" + and not torch.cuda.is_available() + or device.type == "xpu" + and not torch.xpu.is_available() + ): + raise RuntimeError(f"PyTorch backend for {device.type} device type is not available") + + if not torch.distributed.is_initialized(): + if device.type == "cuda": + torch.distributed.init_process_group("nccl") + else: + torch.distributed.init_process_group("gloo") + + if not model_parallel_is_initialized(): + if world_size is None: + world_size = int(os.environ.get("WORLD_SIZE", 1)) + initialize_model_parallel(world_size) + + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + if device.type == "cuda": + torch.cuda.set_device(local_rank) + elif device.type == "xpu": + torch.xpu.set_device(local_rank) + + torch.manual_seed(seed) + + if local_rank > 0: + sys.stdout = open(os.devnull, "w") + + start_time = time.time() + + ckpt_paths = sorted(Path(ckpt_dir).glob("*.pth")) + assert len(ckpt_paths) > 0, f"no checkpoint files found in {ckpt_dir}" + print(f"Loading a checkpoint 
(shards={len(ckpt_paths)}, current-mp-size={world_size})") + with open(Path(ckpt_dir) / "params.json", "r") as f: + params = json.loads(f.read()) + + model_args: ModelArgs = ModelArgs( + max_seq_len=max_seq_len, + max_batch_size=max_batch_size, + **params, + ) + tokenizer = Tokenizer.get_instance() + + state_dict = maybe_reshard_state_dict( + ckpt_paths, + n_kv_heads=model_args.n_kv_heads if model_args.n_kv_heads else model_args.n_heads, + ) + + assert model_args.vocab_size == tokenizer.n_words + + def build_model(): + if model_args.vision_chunk_size > 0: + model = CrossAttentionTransformer(model_args) + model.setup_cache(model_args.max_batch_size, device=device, dtype=torch.get_default_dtype()) + else: + model = Transformer(model_args) + return model + + if quantization_mode == QuantizationMode.fp8_mixed or quantization_mode == QuantizationMode.int4_mixed: + from .quantization.loader import convert_to_quantized_model + + torch.set_default_tensor_type(torch.BFloat16Tensor) + model = build_model() + print("Loading state dict...") + model.load_state_dict(state_dict, strict=False) + print("Done...") + model = convert_to_quantized_model(model, ckpt_dir, quantization_mode, device=device) + torch.set_default_device(device) + else: + print(f"Setting default device to {device}") + torch.set_default_device(device) + if device.type == "cuda": + if torch.cuda.is_bf16_supported(): + torch.set_default_dtype(torch.bfloat16) + else: + torch.set_default_dtype(torch.half) + elif device.type == "xpu": + if torch.xpu.is_bf16_supported(): + torch.set_default_dtype(torch.bfloat16) + else: + torch.set_default_dtype(torch.half) + + model = build_model() + print("Loading state dict...") + model.load_state_dict(state_dict, strict=True) + model.to(device) + print("Done...") + + print(f"Loaded in {time.time() - start_time:.2f} seconds") + + return Llama3(model, tokenizer, model_args) + + def __init__(self, model: Transformer | CrossAttentionTransformer, tokenizer: Tokenizer, args: 
ModelArgs): + self.args = args + self.model = model + self.tokenizer = tokenizer + self.formatter = ChatFormat(tokenizer) + + @torch.inference_mode() + def generate( + self, + model_inputs: List[LLMInput], + temperature: float = 0.6, + top_p: float = 0.9, + max_gen_len: Optional[int] = None, + logprobs: bool = False, + echo: bool = False, + print_model_input: bool = False, + logits_processor: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + ) -> Generator[List[GenerationResult], None, None]: + if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.args.max_seq_len: + max_gen_len = self.args.max_seq_len - 1 + params = self.model.params + + print_model_input = print_model_input or os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1" + if print_model_input: + for inp in model_inputs: + tokens_to_print = [self.formatter.vision_token if t == 128256 else t for t in inp.tokens] + cprint( + "Input to model:\n" + self.tokenizer.decode(tokens_to_print) + "\n", + "red", + ) + prompt_tokens = [inp.tokens for inp in model_inputs] + + bsz = len(model_inputs) + assert bsz <= params.max_batch_size, (bsz, params.max_batch_size) + + min_prompt_len = min(len(t) for t in prompt_tokens) + max_prompt_len = max(len(t) for t in prompt_tokens) + + if max_prompt_len >= params.max_seq_len: + cprint(f"Out of token budget {max_prompt_len} vs {params.max_seq_len}", "red") + return + + total_len = min(max_gen_len + max_prompt_len, params.max_seq_len) + + pad_id = self.tokenizer.pad_id + tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long) + for k, t in enumerate(prompt_tokens): + tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long) + if logprobs: + token_logprobs = torch.zeros_like(tokens, dtype=torch.float) + + is_vision = not isinstance(self.model, Transformer) + if is_vision: + images = [inp.vision.images if inp.vision is not None else [] for inp in model_inputs] + mask = [inp.vision.mask if inp.vision is not None else [] for inp in model_inputs] + 
+ xattn_caches, cross_attention_masks, full_text_row_masked_out_mask = self.model.compute_vision_tokens_masks( + batch_images=images, + batch_masks=mask, + total_len=total_len, + device=tokens.device, + ) + + eos_reached = torch.tensor([False] * bsz) + input_text_mask = tokens != pad_id + + if echo: + for i in range(max_prompt_len): + results = [] + for j, t in enumerate(tokens[:, i]): + results.append( + GenerationResult( + token=t.item(), + text=self.tokenizer.decode([t.item()]), + source="input", + logprobs=(token_logprobs[j, i : i + 1].tolist() if logprobs else None), + batch_idx=j, + finished=False, + ignore_token=t.item() == pad_id, + ) + ) + yield results + + stop_tokens = torch.tensor(self.tokenizer.stop_tokens) + + prev_pos = 0 + for cur_pos in range(min_prompt_len, total_len): + if is_vision: + position_ids = torch.arange(prev_pos, cur_pos, dtype=torch.long) + text_only_inference = all(inp.vision is None for inp in model_inputs) + logits = self.model.forward( + position_ids, + tokens, + cross_attention_masks, + full_text_row_masked_out_mask, + xattn_caches, + text_only_inference, + ) + else: + logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos) + + if logits_processor is not None: + logits = logits_processor(tokens[:, :cur_pos], logits) + + if temperature > 0: + probs = torch.softmax(logits[:, -1] / temperature, dim=-1) + next_token = sample_top_p(probs, top_p) + else: + next_token = torch.argmax(logits[:, -1], dim=-1) + + next_token = next_token.reshape(-1) + # only replace token if prompt has already been generated + next_token = torch.where(input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token) + tokens[:, cur_pos] = next_token + + target = tokens[:, prev_pos + 1 : cur_pos + 1] + if is_vision: + # the logits space (num_classes) is designed to never contain a media_token + # however our input token stream does contain them. 
we need to nuke them here + # or else the CUDA kernels will crash with an illegal memory access + vision_tokens = [self.tokenizer.special_tokens["<|image|>"], 128256] + masks = [target.eq(t) for t in vision_tokens] + if len(masks) > 1: + mask = torch.logical_or(*masks) + else: + mask = masks[0] + target[mask] = 0 + + if logprobs: + token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy( + input=logits.transpose(1, 2), + target=target, + reduction="none", + ignore_index=pad_id, + ) + eos_reached |= (~input_text_mask[:, cur_pos]) & (torch.isin(next_token, stop_tokens)) + results = [] + for idx, t in enumerate(next_token): + results.append( + GenerationResult( + token=t.item(), + text=self.tokenizer.decode([t.item()]), + source="output", + logprobs=(token_logprobs[idx, cur_pos : cur_pos + 1].tolist() if logprobs else None), + batch_idx=idx, + finished=eos_reached[idx], + ignore_token=cur_pos < len(prompt_tokens[idx]), + ) + ) + yield results + + prev_pos = cur_pos + if all(eos_reached): + break + + def completion( + self, + contents: List[RawContent], + temperature: float = 0.6, + top_p: float = 0.9, + max_gen_len: Optional[int] = None, + logprobs: bool = False, + echo: bool = False, + ) -> Generator[List[GenerationResult], None, None]: + model_inputs = [self.formatter.encode_content(c) for c in contents] + for result in self.generate( + model_inputs=model_inputs, + temperature=temperature, + top_p=top_p, + max_gen_len=max_gen_len, + logprobs=logprobs, + echo=echo, + ): + yield result + if all(r.finished for r in result): + break + + def chat_completion( + self, + messages_batch: List[List[RawMessage]], + temperature: float = 0.6, + top_p: float = 0.9, + max_gen_len: Optional[int] = None, + logprobs: bool = False, + tool_prompt_format: ToolPromptFormat = ToolPromptFormat.json, + echo: bool = False, + ) -> Generator[List[GenerationResult], None, None]: + model_inputs = [self.formatter.encode_dialog_prompt(messages) for messages in messages_batch] + for result 
in self.generate( + model_inputs=model_inputs, + temperature=temperature, + top_p=top_p, + max_gen_len=max_gen_len, + logprobs=logprobs, + echo=echo, + ): + yield result + if all(r.finished for r in result): + break + + +def sample_top_p(probs, p): + """ + Perform top-p (nucleus) sampling on a probability distribution. + + Args: + probs (torch.Tensor): Probability distribution tensor. + p (float): Probability threshold for top-p sampling. + + Returns: + torch.Tensor: Sampled token indices. + + Note: + Top-p sampling selects the smallest set of tokens whose cumulative probability mass + exceeds the threshold p. The distribution is renormalized based on the selected tokens. + """ + probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) + probs_sum = torch.cumsum(probs_sort, dim=-1) + mask = probs_sum - probs_sort > p + probs_sort[mask] = 0.0 + probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) + next_token = torch.multinomial(probs_sort, num_samples=1) + next_token = torch.gather(probs_idx, -1, next_token) + return next_token diff --git a/llama_stack/models/llama/llama3/interface.py b/llama_stack/models/llama/llama3/interface.py index 2579ab6c8..8684237df 100644 --- a/llama_stack/models/llama/llama3/interface.py +++ b/llama_stack/models/llama/llama3/interface.py @@ -16,7 +16,7 @@ from typing import List, Optional from termcolor import colored -from llama_stack.models.llama.datatypes import ( +from ..datatypes import ( BuiltinTool, RawMessage, StopReason, @@ -24,7 +24,6 @@ from llama_stack.models.llama.datatypes import ( ToolDefinition, ToolPromptFormat, ) - from . 
import template_data from .chat_format import ChatFormat from .prompt_templates import ( diff --git a/llama_stack/providers/inline/inference/meta_reference/llama3/model.py b/llama_stack/models/llama/llama3/model.py similarity index 94% rename from llama_stack/providers/inline/inference/meta_reference/llama3/model.py rename to llama_stack/models/llama/llama3/model.py index a49167980..2562673e2 100644 --- a/llama_stack/providers/inline/inference/meta_reference/llama3/model.py +++ b/llama_stack/models/llama/llama3/model.py @@ -4,16 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. - -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement. - import math from typing import Optional, Tuple @@ -29,6 +19,10 @@ from torch import nn from .args import ModelArgs +# **NOTE**: This code is not runnable without installing `torch` and `fairscale` +# dependencies. These dependencies are not part of the default dependencies +# (requirements.txt) of the `llama-models` package. 
+ class RMSNorm(torch.nn.Module): def __init__(self, dim: int, eps: float = 1e-6): @@ -111,9 +105,9 @@ class Attention(nn.Module): def __init__(self, args: ModelArgs): super().__init__() self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads - model_parallel_size = fs_init.get_model_parallel_world_size() - self.n_local_heads = args.n_heads // model_parallel_size - self.n_local_kv_heads = self.n_kv_heads // model_parallel_size + world_size = fs_init.get_model_parallel_world_size() + self.n_local_heads = args.n_heads // world_size + self.n_local_kv_heads = self.n_kv_heads // world_size self.n_rep = self.n_local_heads // self.n_local_kv_heads self.head_dim = args.dim // args.n_heads diff --git a/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/__init__.py b/llama_stack/models/llama/llama3/multimodal/__init__.py similarity index 100% rename from llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/__init__.py rename to llama_stack/models/llama/llama3/multimodal/__init__.py diff --git a/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/encoder_utils.py b/llama_stack/models/llama/llama3/multimodal/encoder_utils.py similarity index 100% rename from llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/encoder_utils.py rename to llama_stack/models/llama/llama3/multimodal/encoder_utils.py diff --git a/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/image_transform.py b/llama_stack/models/llama/llama3/multimodal/image_transform.py similarity index 100% rename from llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/image_transform.py rename to llama_stack/models/llama/llama3/multimodal/image_transform.py diff --git a/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/model.py b/llama_stack/models/llama/llama3/multimodal/model.py similarity index 95% rename from 
llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/model.py rename to llama_stack/models/llama/llama3/multimodal/model.py index 3d0d77c87..0cb18b948 100644 --- a/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/model.py +++ b/llama_stack/models/llama/llama3/multimodal/model.py @@ -4,16 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. - -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. - import logging import math from functools import partial @@ -180,14 +170,14 @@ class ImageAttention(nn.Module): n_heads, ): super().__init__() - model_parallel_size = fs_init.get_model_parallel_world_size() + world_size = fs_init.get_model_parallel_world_size() qkvo_replication = 1 - if model_parallel_size > 16: - qkvo_replication = model_parallel_size // 8 + if world_size > 16: + qkvo_replication = world_size // 8 self.n_kv_heads = n_heads - self.n_local_heads = n_heads * qkvo_replication // model_parallel_size - self.n_local_kv_heads = self.n_kv_heads * qkvo_replication // model_parallel_size + self.n_local_heads = n_heads * qkvo_replication // world_size + self.n_local_kv_heads = self.n_kv_heads * qkvo_replication // world_size self.n_rep = self.n_local_heads // self.n_local_kv_heads self.head_dim = dim // n_heads @@ -536,16 +526,16 @@ class Attention(nn.Module): cache_v (torch.Tensor): Cached values for attention. 
""" super().__init__() - model_parallel_size = fs_init.get_model_parallel_world_size() + world_size = fs_init.get_model_parallel_world_size() replication_factor = 1 - if model_parallel_size > 8: - replication_factor = model_parallel_size // MP_SCALE + if world_size > 8: + replication_factor = world_size // MP_SCALE self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads self.n_kv_heads *= replication_factor - self.n_local_heads = args.n_heads // model_parallel_size - self.n_local_kv_heads = self.n_kv_heads // model_parallel_size + self.n_local_heads = args.n_heads // world_size + self.n_local_kv_heads = self.n_kv_heads // world_size self.n_rep = self.n_local_heads // self.n_local_kv_heads self.head_dim = args.dim // args.n_heads self.max_seq_len = args.max_seq_len @@ -587,13 +577,11 @@ class Attention(nn.Module): self.n_local_kv_heads, self.head_dim, ) - device = next(self.parameters()).device self.register_buffer( "key_cache", torch.zeros( cache_shape, dtype=dtype, - device=device, ), persistent=False, ) @@ -602,7 +590,6 @@ class Attention(nn.Module): torch.zeros( cache_shape, dtype=dtype, - device=device, ), persistent=False, ) @@ -614,6 +601,9 @@ class Attention(nn.Module): freqs_cis: torch.Tensor, position_ids: torch.LongTensor, ): + self.key_cache = self.key_cache.to(x.device) + self.value_cache = self.value_cache.to(x.device) + xq, xk, xv = [F.linear(x, w) for w in [self.wq.weight, self.wk.weight, self.wv.weight]] bs, slen, _ = xq.shape @@ -832,10 +822,10 @@ class CrossAttention(torch.nn.Module): norm_eps: float, ): super().__init__() - self.model_parallel_size = fs_init.get_model_parallel_world_size() + self.world_size = fs_init.get_model_parallel_world_size() replication_factor = 1 - if self.model_parallel_size > 8: - replication_factor = self.model_parallel_size // MP_SCALE + if self.world_size > 8: + replication_factor = self.world_size // MP_SCALE n_kv_heads *= replication_factor assert n_heads % n_kv_heads == 0 @@ -889,10 +879,10 
@@ class CrossAttention(torch.nn.Module): # trunk LLM (i.e., group query attention) -- @dubeya # local heads assert self.n_heads % self.n_kv_heads == 0 - assert self.n_heads % self.model_parallel_size == 0 - assert self.n_kv_heads % self.model_parallel_size == 0 - self.n_local_heads = self.n_heads // self.model_parallel_size - self.n_local_kv_heads = self.n_kv_heads // self.model_parallel_size + assert self.n_heads % self.world_size == 0 + assert self.n_kv_heads % self.world_size == 0 + self.n_local_heads = self.n_heads // self.world_size + self.n_local_kv_heads = self.n_kv_heads // self.world_size self.n_rep = self.n_local_heads // self.n_local_kv_heads def _compute_xattn_kv_cache(self, xattn_tokens: torch.Tensor) -> torch.Tensor: @@ -1041,7 +1031,7 @@ class CrossAttentionTransformerVision(torch.nn.Module): self.image_res = args.vision_chunk_size self.max_num_chunks = args.vision_max_num_chunks if return_intermediate is not None: - return_intermediate = [int(level) for level in return_intermediate.split(",")] + return_intermediate = [int(layer) for layer in return_intermediate.split(",")] self.vision_input_dim = (len(return_intermediate) + 1) * self.vision_input_dim self.patch_size = 14 self.vision_encoder = VisionEncoder( @@ -1076,15 +1066,15 @@ class CrossAttentionTransformerText(torch.nn.Module): def __init__(self, args: ModelArgs) -> None: super().__init__() - self.model_parallel_size = fs_init.get_model_parallel_world_size() + self.world_size = fs_init.get_model_parallel_world_size() assert args.vocab_size > 0 self.vocab_size = args.vocab_size self.n_layers = args.n_layers self.dim = args.dim self.head_dim = args.dim // args.n_heads self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads - self.n_local_kv_heads = self.n_kv_heads // self.model_parallel_size - assert self.vocab_size % self.model_parallel_size == 0 + self.n_local_kv_heads = self.n_kv_heads // self.world_size + assert self.vocab_size % self.world_size == 0 
self.tok_embeddings = VocabParallelEmbedding(args.vocab_size, args.dim, init_method=lambda x: x) self.pos_embeddings = None # final norm layer (not necessary for post-norm) @@ -1184,6 +1174,8 @@ class CrossAttentionTransformerText(torch.nn.Module): text_only_inference: bool = False, ): assert self.cache_is_setup, "Please set up cache before calling forward" + self.mask_cache = self.mask_cache.to(h.device) + self.freqs_cis = self.freqs_cis.to(h.device) mask = self.mask_cache.index_select(2, position_ids) freqs_cis = self.freqs_cis.index_select(0, position_ids) @@ -1212,9 +1204,8 @@ class CrossAttentionTransformerText(torch.nn.Module): output = gather_from_tensor_model_parallel_region(output) return output.float() - def setup_cache(self, max_batch_size: int, dtype=torch.bfloat16): + def setup_cache(self, max_batch_size: int, device: torch.device, dtype=torch.bfloat16): # Set up the text kv caches - device = next(self.parameters()).device ones = torch.ones( (self.max_seq_len, self.max_seq_len), dtype=torch.bool, @@ -1265,7 +1256,7 @@ class CrossAttentionTransformerText(torch.nn.Module): return ( cross_attention_masks.to(device=text_device, dtype=text_dtype), - full_text_row_masked_out_mask, + full_text_row_masked_out_mask.to(device=text_device), ) @@ -1284,14 +1275,15 @@ class CrossAttentionTransformer(torch.nn.Module): max_num_chunks=args.vision_max_num_chunks, ) - def setup_cache(self, max_batch_size: int, dtype: torch.dtype): - self.text_model.setup_cache(max_batch_size, dtype) + def setup_cache(self, max_batch_size: int, device: torch.device, dtype: torch.dtype): + self.text_model.setup_cache(max_batch_size, device, dtype) def compute_vision_tokens_masks( self, batch_images: List[List[PIL_Image.Image]], batch_masks: List[List[List[int]]], total_len: int, + device: torch.device, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: skip_vision_encoder = False @@ -1318,6 +1310,7 @@ class CrossAttentionTransformer(torch.nn.Module): 
image_res=self.params.vision_chunk_size, max_num_images=max_num_images, ) + stacked_images = stacked_images.to(device=device) if skip_vision_encoder: vision_tokens = torch.zeros( @@ -1330,7 +1323,7 @@ class CrossAttentionTransformer(torch.nn.Module): ), ) else: - vision_tokens = self.vision_model(stacked_images, aspect_ratios) + vision_tokens = self.vision_model(stacked_images, aspect_ratios).to(device=device) bsz, nimg, nchunk, ntok, image_token_dim = tuple(vision_tokens.shape) xattn_caches = torch.stack( diff --git a/llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/utils.py b/llama_stack/models/llama/llama3/multimodal/utils.py similarity index 100% rename from llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/utils.py rename to llama_stack/models/llama/llama3/multimodal/utils.py diff --git a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py index e03fcfc93..d4e825a22 100644 --- a/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +++ b/llama_stack/models/llama/llama3/prompt_templates/system_prompts.py @@ -15,7 +15,7 @@ import textwrap from datetime import datetime from typing import Any, List, Optional -from llama_stack.models.llama.datatypes import ( +from llama_stack.apis.inference import ( BuiltinTool, ToolDefinition, ToolParamDefinition, diff --git a/llama_stack/templates/meta-reference-quantized-gpu/__init__.py b/llama_stack/models/llama/llama3/quantization/__init__.py similarity index 74% rename from llama_stack/templates/meta-reference-quantized-gpu/__init__.py rename to llama_stack/models/llama/llama3/quantization/__init__.py index 1cfdb2c6a..756f351d8 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/__init__.py +++ b/llama_stack/models/llama/llama3/quantization/__init__.py @@ -3,5 +3,3 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of 
this source tree. - -from .meta_reference import get_distribution_template # noqa: F401 diff --git a/llama_stack/providers/inline/inference/meta_reference/llama3/quantization/loader.py b/llama_stack/models/llama/llama3/quantization/loader.py similarity index 84% rename from llama_stack/providers/inline/inference/meta_reference/llama3/quantization/loader.py rename to llama_stack/models/llama/llama3/quantization/loader.py index 5109130b4..771fd02be 100644 --- a/llama_stack/providers/inline/inference/meta_reference/llama3/quantization/loader.py +++ b/llama_stack/models/llama/llama3/quantization/loader.py @@ -4,9 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement. - # type: ignore import os from typing import Any, Dict, List, Optional, cast @@ -18,22 +15,15 @@ from fairscale.nn.model_parallel.mappings import reduce_from_model_parallel_regi from torch import Tensor, nn from torchao.quantization.GPTQ import Int8DynActInt4WeightLinear -from llama_stack.apis.inference import QuantizationType -from llama_stack.log import get_logger -from llama_stack.models.llama.datatypes import CheckpointQuantizationFormat -from llama_stack.models.llama.sku_list import resolve_model -from llama_stack.providers.inline.inference.meta_reference.quantize_impls import ( +from ...datatypes import QuantizationMode +from ...quantize_impls import ( Fp8ScaledWeights, ffn_swiglu, load_fp8, quantize_fp8, ) - -from ...config import MetaReferenceQuantizedInferenceConfig -from ..args import ModelArgs from ..model import Transformer, TransformerBlock - -log = get_logger(__name__, category="quantization") +from ..multimodal.model import CrossAttentionTransformer def swiglu_wrapper( @@ -44,30 +34,34 @@ def swiglu_wrapper( return 
reduce_from_model_parallel_region(out) +def convert_to_quantized_model( + model: Transformer | CrossAttentionTransformer, + checkpoint_dir: str, + quantization_mode: Optional[str] = None, + fp8_activation_scale_ub: Optional[float] = 1200.0, + device: Optional[torch.device] = None, +) -> Transformer | CrossAttentionTransformer: + if quantization_mode == QuantizationMode.fp8_mixed: + return convert_to_fp8_quantized_model(model, checkpoint_dir, fp8_activation_scale_ub, device) + elif quantization_mode == QuantizationMode.int4_mixed: + return convert_to_int4_quantized_model(model, checkpoint_dir, device) + else: + raise ValueError(f"Unsupported quantization mode: {quantization_mode}") + + def convert_to_fp8_quantized_model( model: Transformer, - config: MetaReferenceQuantizedInferenceConfig, checkpoint_dir: str, fp8_activation_scale_ub: Optional[float] = 1200.0, + device: Optional[torch.device] = None, ) -> Transformer: - if config.quantization.type == QuantizationType.bf16.value: - return model - - elif config.quantization.type != QuantizationType.fp8.value: - raise ValueError("Only FP8 quantization is supported") - - assert config.model is not None, "Model must be specified for quantized inference" - llama_model = resolve_model(config.model) - assert llama_model is not None, f"Model {config.model} not found" - # Move weights to GPU with quantization - if llama_model.quantization_format == CheckpointQuantizationFormat.fp8_mixed.value: - log.info("Loading fp8 scales...") - fp8_scales_path = os.path.join(checkpoint_dir, f"fp8_scales_{get_model_parallel_rank()}.pt") - assert os.path.isfile(fp8_scales_path), f"fp8_scales_path not found for rank {get_model_parallel_rank()}" + fp8_scales_path = os.path.join(checkpoint_dir, f"fp8_scales_{get_model_parallel_rank()}.pt") + if os.path.isfile(fp8_scales_path): + print("Loading fp8 scales...") fp8_scales = torch.load(fp8_scales_path, weights_only=True) - for block in model.layers: + for _, block in model.named_modules(): if 
isinstance(block, TransformerBlock): if block.layer_id == 0 or block.layer_id == (model.n_layers - 1): continue @@ -81,8 +75,8 @@ def convert_to_fp8_quantized_model( fp8_activation_scale_ub, ) else: - log.info("Quantizing fp8 weights from bf16...") - for block in model.layers: + print("Quantizing fp8 weights from bf16...") + for _, block in model.named_modules(): if isinstance(block, TransformerBlock): if block.layer_id == 0 or block.layer_id == (model.n_layers - 1): continue @@ -92,12 +86,12 @@ def convert_to_fp8_quantized_model( param.weight = quantize_fp8( param.weight, fp8_activation_scale_ub, - output_device=torch.device("cuda"), + output_device=device, ) for _, parameter in model.named_parameters(): if not isinstance(parameter, Fp8ScaledWeights): - parameter.data = parameter.to(device="cuda") + parameter.data = parameter.to(device=device) return model @@ -290,12 +284,12 @@ def _prepare_model_int4_weight_int8_dynamic_activation( def convert_to_int4_quantized_model( - model: Transformer, - model_args: ModelArgs, - config: MetaReferenceQuantizedInferenceConfig, -) -> Transformer: + model: Transformer | CrossAttentionTransformer, + checkpoint_dir: str, + device: Optional[torch.device] = None, +) -> Transformer | CrossAttentionTransformer: """Convert the model to int4 quantized model.""" - + model_args = model.params assert model_args.quantization_args is not None, "Quantization args must be specified." 
quantization_args = model_args.quantization_args if quantization_args.scheme is None: @@ -319,5 +313,4 @@ def convert_to_int4_quantized_model( lora_scale = model_args.lora_args.scale _prepare_model_int4_weight_int8_dynamic_activation(model, group_size, lora_rank, lora_scale) - device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - return cast(Transformer, model.to(device)) + return cast(Transformer | CrossAttentionTransformer, model.to(device=device)) diff --git a/llama_stack/models/llama/llama3/template_data.py b/llama_stack/models/llama/llama3/template_data.py index 076b4adb4..efca8397e 100644 --- a/llama_stack/models/llama/llama3/template_data.py +++ b/llama_stack/models/llama/llama3/template_data.py @@ -12,8 +12,7 @@ # the top-level of this source tree. -from llama_stack.models.llama.datatypes import BuiltinTool, StopReason, ToolCall - +from ..datatypes import BuiltinTool, StopReason, ToolCall from .prompt_templates import ( BuiltinToolGenerator, JsonCustomToolGenerator, diff --git a/llama_stack/models/llama/llama3/tokenizer.py b/llama_stack/models/llama/llama3/tokenizer.py index b240fa246..d3cc4fc07 100644 --- a/llama_stack/models/llama/llama3/tokenizer.py +++ b/llama_stack/models/llama/llama3/tokenizer.py @@ -4,16 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. - -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement. 
- import os from logging import getLogger from pathlib import Path diff --git a/llama_stack/models/llama/llama3/tool_utils.py b/llama_stack/models/llama/llama3/tool_utils.py index 71018898c..fc8287eb6 100644 --- a/llama_stack/models/llama/llama3/tool_utils.py +++ b/llama_stack/models/llama/llama3/tool_utils.py @@ -16,7 +16,8 @@ import re from typing import Optional, Tuple from llama_stack.log import get_logger -from llama_stack.models.llama.datatypes import BuiltinTool, RecursiveType, ToolCall, ToolPromptFormat + +from ..datatypes import BuiltinTool, RecursiveType, ToolCall, ToolPromptFormat logger = get_logger(name=__name__, category="inference") diff --git a/llama_stack/models/llama/llama3_2/__init__.py b/llama_stack/models/llama/llama3_2/__init__.py index 38ee47d66..756f351d8 100644 --- a/llama_stack/models/llama/llama3_2/__init__.py +++ b/llama_stack/models/llama/llama3_2/__init__.py @@ -3,10 +3,3 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. - -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. diff --git a/llama_stack/models/llama/llama3_2/prompts_text.py b/llama_stack/models/llama/llama3_2/prompts_text.py index 7bc7e3219..7a1f9887c 100644 --- a/llama_stack/models/llama/llama3_2/prompts_text.py +++ b/llama_stack/models/llama/llama3_2/prompts_text.py @@ -4,12 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. import json import textwrap diff --git a/llama_stack/models/llama/llama3_2/prompts_vision.py b/llama_stack/models/llama/llama3_2/prompts_vision.py index b1ede418b..b0f11cab6 100644 --- a/llama_stack/models/llama/llama3_2/prompts_vision.py +++ b/llama_stack/models/llama/llama3_2/prompts_vision.py @@ -4,13 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. - import textwrap from pathlib import Path diff --git a/llama_stack/providers/inline/inference/meta_reference/llama4/args.py b/llama_stack/models/llama/llama4/args.py similarity index 91% rename from llama_stack/providers/inline/inference/meta_reference/llama4/args.py rename to llama_stack/models/llama/llama4/args.py index 046448ef6..6d7c1d409 100644 --- a/llama_stack/providers/inline/inference/meta_reference/llama4/args.py +++ b/llama_stack/models/llama/llama4/args.py @@ -4,13 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. 
- from enum import Enum from typing import Optional diff --git a/llama_stack/models/llama/llama4/chat_format.py b/llama_stack/models/llama/llama4/chat_format.py index c873012d6..160bb00f8 100644 --- a/llama_stack/models/llama/llama4/chat_format.py +++ b/llama_stack/models/llama/llama4/chat_format.py @@ -13,7 +13,7 @@ import torch from PIL import Image as PIL_Image # TODO: either fork these or move them to the common package -from llama_stack.models.llama.datatypes import ( +from ..datatypes import ( BuiltinTool, RawContent, RawMediaItem, @@ -24,16 +24,10 @@ from llama_stack.models.llama.datatypes import ( ToolCall, ToolPromptFormat, ) -from llama_stack.models.llama.llama3.tool_utils import ToolUtils -from llama_stack.providers.inline.inference.meta_reference.llama4.args import VisionArgs -from llama_stack.providers.inline.inference.meta_reference.llama4.datatypes import ( - LLMInput, -) -from llama_stack.providers.inline.inference.meta_reference.llama4.preprocess import ( - ResizeNormalizeImageTransform, - VariableSizeImageTransform, -) - +from ..llama3.tool_utils import ToolUtils +from .args import VisionArgs +from .datatypes import LLMInput +from .preprocess import ResizeNormalizeImageTransform, VariableSizeImageTransform from .tokenizer import Tokenizer @@ -54,7 +48,7 @@ class TransformedImage: aspect_ratio: Tuple[int, int] -def convert_rgba_to_rgb(image: PIL_Image.Image, bg: Tuple[int, int, int] = (255, 255, 255)) -> PIL_Image.Image: +def convert_image_to_rgb(image: PIL_Image.Image, bg: Tuple[int, int, int] = (255, 255, 255)) -> PIL_Image.Image: if image.mode == "RGBA": image.load() # for png.split() new_img = PIL_Image.new("RGB", image.size, bg) @@ -171,7 +165,7 @@ class ChatFormat: bytes_io = io.BytesIO(c.data) if isinstance(c.data, bytes) else c.data image = PIL_Image.open(bytes_io) - image = convert_rgba_to_rgb(image) + image = convert_image_to_rgb(image) image_tiles, ar = self.dynamic_image_transform(image, max_num_chunks=self.max_num_chunks) if 
image_tiles.shape[0] > 1: diff --git a/llama_stack/providers/inline/inference/meta_reference/llama4/datatypes.py b/llama_stack/models/llama/llama4/datatypes.py similarity index 85% rename from llama_stack/providers/inline/inference/meta_reference/llama4/datatypes.py rename to llama_stack/models/llama/llama4/datatypes.py index bb1c19a12..27174db63 100644 --- a/llama_stack/providers/inline/inference/meta_reference/llama4/datatypes.py +++ b/llama_stack/models/llama/llama4/datatypes.py @@ -4,13 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. - from dataclasses import dataclass from typing import List, Optional, Union diff --git a/llama_stack/providers/inline/inference/meta_reference/llama4/ffn.py b/llama_stack/models/llama/llama4/ffn.py similarity index 100% rename from llama_stack/providers/inline/inference/meta_reference/llama4/ffn.py rename to llama_stack/models/llama/llama4/ffn.py diff --git a/llama_stack/providers/inline/inference/meta_reference/llama4/generation.py b/llama_stack/models/llama/llama4/generation.py similarity index 72% rename from llama_stack/providers/inline/inference/meta_reference/llama4/generation.py rename to llama_stack/models/llama/llama4/generation.py index de900ce8d..7a4087c8f 100644 --- a/llama_stack/providers/inline/inference/meta_reference/llama4/generation.py +++ b/llama_stack/models/llama/llama4/generation.py @@ -10,40 +10,28 @@ import json import os import sys import time -from enum import Enum from pathlib import Path from typing import Callable, Generator, List, Optional import torch import torch.nn.functional as F from fairscale.nn.model_parallel.initialize import ( - 
get_model_parallel_rank, initialize_model_parallel, model_parallel_is_initialized, ) from termcolor import cprint -from llama_stack.models.llama.llama4.chat_format import ( - ChatFormat, - RawContent, - RawMessage, -) -from llama_stack.models.llama.llama4.tokenizer import Tokenizer - -from ..common import TokenResult +from ..checkpoint import maybe_reshard_state_dict +from ..datatypes import GenerationResult, QuantizationMode from .args import ModelArgs +from .chat_format import ChatFormat, RawContent, RawMessage from .datatypes import LLMInput, MaskedEmbedding, TransformerInput from .model import Transformer +from .tokenizer import Tokenizer torch.serialization.add_safe_globals([io.BytesIO, codecs.encode]) -class QuantizationMode(str, Enum): - none = "none" - fp8_mixed = "fp8_mixed" - int4_mixed = "int4_mixed" - - class Llama4: @staticmethod def build( @@ -51,7 +39,7 @@ class Llama4: max_seq_len: int, max_batch_size: int, world_size: Optional[int] = None, - quantization_mode: Optional[str] = None, + quantization_mode: Optional[QuantizationMode] = None, seed: int = 1, ): if not torch.distributed.is_initialized(): @@ -72,11 +60,9 @@ class Llama4: start_time = time.time() - checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) - assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}" - assert world_size == len(checkpoints), ( - f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}" - ) + ckpt_paths = sorted(Path(ckpt_dir).glob("*.pth")) + assert len(ckpt_paths) > 0, f"no checkpoint files found in {ckpt_dir}" + print(f"Loading a checkpoint (shards={len(ckpt_paths)}, current-mp-size={world_size})") with open(Path(ckpt_dir) / "params.json", "r") as f: params = json.loads(f.read()) @@ -93,10 +79,11 @@ class Llama4: assert model_args.vocab_size == tokenizer.n_words, f"{model_args.vocab_size=} vs. 
{tokenizer.n_words=} mismatch" print("Model args:\n", model_args.model_dump_json(indent=2)) - ckpt_path = checkpoints[get_model_parallel_rank()] - print(f"Loading checkpoint from {ckpt_dir}...") - with open(ckpt_path, "rb") as f: - checkpoint = torch.load(f, map_location="cpu", weights_only=True) + state_dict = maybe_reshard_state_dict( + ckpt_paths, + n_kv_heads=model_args.n_kv_heads if model_args.n_kv_heads else model_args.n_heads, + moe_num_experts=model_args.moe_args.num_experts, + ) print("Loaded checkpoint") if quantization_mode == QuantizationMode.fp8_mixed or quantization_mode == QuantizationMode.int4_mixed: from .quantization.loader import convert_to_quantized_model @@ -104,9 +91,9 @@ class Llama4: torch.set_default_tensor_type(torch.BFloat16Tensor) model = Transformer(model_args) print("Loading state dict...") - model.load_state_dict(checkpoint, strict=False) + model.load_state_dict(state_dict, strict=False) print("Done...") - model = convert_to_quantized_model(model, ckpt_dir) + model = convert_to_quantized_model(model, ckpt_dir, quantization_mode) else: if torch.cuda.is_bf16_supported(): torch.set_default_tensor_type(torch.cuda.BFloat16Tensor) @@ -115,7 +102,7 @@ class Llama4: model = Transformer(model_args) print("Loading state dict...") - model.load_state_dict(checkpoint, strict=False) + model.load_state_dict(state_dict, strict=False) print("Done...") print(f"Loaded in {time.time() - start_time:.2f} seconds") @@ -130,7 +117,7 @@ class Llama4: @torch.inference_mode() def generate( self, - llm_input: LLMInput, + llm_inputs: List[LLMInput], temperature: float = 0.6, top_p: float = 0.9, max_gen_len: Optional[int] = None, @@ -138,22 +125,20 @@ class Llama4: echo: bool = False, print_model_input: bool = False, logits_processor: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, - ) -> Generator: + ) -> Generator[List[GenerationResult], None, None]: if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.model.args.max_seq_len: 
max_gen_len = self.model.args.max_seq_len - 1 params = self.model.args print_model_input = print_model_input or os.environ.get("LLAMA_MODELS_DEBUG", "0") == "1" - if print_model_input and get_model_parallel_rank() == 0: - tokens_to_print = list(llm_input.tokens) - cprint( - "Input to model:\n" + self.tokenizer.decode(tokens_to_print) + "\n", - "red", - ) - prompt_tokens = [llm_input.tokens] + if print_model_input: + cprint("Input to model:\n", "yellow") + for inp in llm_inputs: + cprint(self.tokenizer.decode(inp.tokens), "grey") + prompt_tokens = [inp.tokens for inp in llm_inputs] - bsz = 1 + bsz = len(llm_inputs) assert bsz <= params.max_batch_size, (bsz, params.max_batch_size) min_prompt_len = min(len(t) for t in prompt_tokens) @@ -176,24 +161,33 @@ class Llama4: input_text_mask = tokens != pad_id if echo: - for i, t in enumerate(llm_input.tokens): - yield TokenResult( - token=t, - text=self.tokenizer.decode([t]), - logprobs=(token_logprobs[0, i : i + 1].tolist() if logprobs else None), - ) + for i in range(max_prompt_len): + results = [] + for j, t in enumerate(tokens[:, i]): + results.append( + GenerationResult( + token=t.item(), + text=self.tokenizer.decode([t.item()]), + source="input", + logprobs=(token_logprobs[j, i : i + 1].tolist() if logprobs else None), + batch_idx=j, + finished=False, + ignore_token=t.item() == pad_id, + ) + ) + yield results stop_tokens = torch.tensor(self.tokenizer.stop_tokens, device="cuda") prev_pos = 0 for cur_pos in range(min_prompt_len, total_len): image_embedding = None - if prev_pos == 0 and llm_input.images is not None and len(llm_input.images) > 0: + if prev_pos == 0 and any(inp.images is not None and len(inp.images) > 0 for inp in llm_inputs): image_mask = tokens[:, prev_pos:cur_pos] == self.tokenizer.special_tokens["<|patch|>"] image_mask = image_mask.unsqueeze(-1) h = self.model.tok_embeddings(tokens[:, prev_pos:cur_pos]) - image_batch = [llm_input.images] + image_batch = [inp.images if inp.images is not None else [] for 
inp in llm_inputs] image_embedding = MaskedEmbedding( embedding=self.model.vision_embeddings(image_batch, image_mask, h), mask=image_mask, @@ -229,11 +223,21 @@ class Llama4: ignore_index=pad_id, ) eos_reached |= (~input_text_mask[:, cur_pos]) & (torch.isin(next_token, stop_tokens)) - yield TokenResult( - token=next_token[0].item(), - text=self.tokenizer.decode(next_token.tolist()), - logprobs=(token_logprobs[:, cur_pos : cur_pos + 1][0].tolist() if logprobs else None), - ) + + results = [] + for idx, t in enumerate(next_token): + results.append( + GenerationResult( + token=t.item(), + text=self.tokenizer.decode([t.item()]), + source="output", + logprobs=(token_logprobs[idx, cur_pos : cur_pos + 1].tolist() if logprobs else None), + batch_idx=idx, + finished=eos_reached[idx], + ignore_token=cur_pos < len(prompt_tokens[idx]), + ) + ) + yield results prev_pos = cur_pos if all(eos_reached): @@ -241,68 +245,47 @@ class Llama4: def completion( self, - content: RawContent, + contents: List[RawContent], temperature: float = 0.6, top_p: float = 0.9, max_gen_len: Optional[int] = None, logprobs: bool = False, echo: bool = False, - ) -> Generator: - llm_input = self.formatter.encode_content(content) + ) -> Generator[List[GenerationResult], None, None]: + llm_inputs = [self.formatter.encode_content(c) for c in contents] for result in self.generate( - llm_input=llm_input, + llm_inputs=llm_inputs, temperature=temperature, top_p=top_p, max_gen_len=max_gen_len, logprobs=logprobs, echo=echo, ): - if result.token in self.tokenizer.stop_tokens: - break yield result + if all(r.finished for r in result): + break def chat_completion( self, - messages: List[RawMessage], + messages_batch: List[List[RawMessage]], temperature: float = 0.6, top_p: float = 0.9, max_gen_len: Optional[int] = None, logprobs: bool = False, echo: bool = False, - ) -> Generator: - llm_input = self.formatter.encode_dialog_prompt(messages) + ) -> Generator[List[GenerationResult], None, None]: + llm_inputs = 
[self.formatter.encode_dialog_prompt(messages) for messages in messages_batch] for result in self.generate( - llm_input=llm_input, + llm_inputs=llm_inputs, temperature=temperature, top_p=top_p, max_gen_len=max_gen_len, logprobs=logprobs, echo=echo, ): - if result.token in self.tokenizer.stop_tokens: - break yield result - - def chat_completion_raw( - self, - messages: List[RawMessage], - temperature: float = 0.6, - top_p: float = 0.9, - max_gen_len: Optional[int] = None, - logprobs: bool = False, - ): - llm_input = self.formatter.encode_dialog_prompt(messages) - output_tokens = [] - for result in self.generate( - llm_input=llm_input, - temperature=temperature, - top_p=top_p, - max_gen_len=max_gen_len, - logprobs=logprobs, - ): - output_tokens.append(result.token) - - return llm_input.tokens, output_tokens + if all(r.finished for r in result): + break def sample_top_p(probs, p): diff --git a/llama_stack/providers/inline/inference/meta_reference/llama4/model.py b/llama_stack/models/llama/llama4/model.py similarity index 97% rename from llama_stack/providers/inline/inference/meta_reference/llama4/model.py rename to llama_stack/models/llama/llama4/model.py index a35d6857f..08fac7714 100644 --- a/llama_stack/providers/inline/inference/meta_reference/llama4/model.py +++ b/llama_stack/models/llama/llama4/model.py @@ -4,16 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. - -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement. 
- import math from typing import Any, Dict, List, Optional, Tuple @@ -184,7 +174,6 @@ class Attention(nn.Module): self.head_dim, ) ).cuda() - self.qk_norm = None if self.use_qk_norm: self.qk_norm = L2Norm(args.norm_eps) diff --git a/llama_stack/providers/inline/inference/meta_reference/llama4/moe.py b/llama_stack/models/llama/llama4/moe.py similarity index 87% rename from llama_stack/providers/inline/inference/meta_reference/llama4/moe.py rename to llama_stack/models/llama/llama4/moe.py index 8cecab7dd..2ce49e915 100644 --- a/llama_stack/providers/inline/inference/meta_reference/llama4/moe.py +++ b/llama_stack/models/llama/llama4/moe.py @@ -100,31 +100,21 @@ class Experts(nn.Module): class MoE(torch.nn.Module): """ - This EC implementation is modified from the original EC module. - We refactored the token permutation and unpermutation logic and added support to tp and dp2ep sharding. - This module supports 3 sharding methods of the experts: - - tp: each TP rank has n_experts experts. Experts are sharded following the conventional row/column-parallel TP sharding. - - tp2ep: each TP rank has n_experts/tp experts. Experts are not sharded. - - dp2ep: each EP rank has n_experts/ep experts. Experts are sharded following the row/column-parallel TP sharding. Tensors used in this module are annotated with the suffixes that indicate the shape of the tensor. 
Several commonly used annotations include: - a: bsz*slen - E: number of experts - e: number of local experts per ep (n_experts/ep) - - et: number of local experts per tp (n_experts/tp) - D: hidden dimension - d: D/tp - F: model dimension - - f: F/tp (used in column/row-parallel linear) - G: number of tokens per expert (a * capacity_factor / E) - g: number of tokens per expert per TP rank (i.e., G/TP) - - GG: G*EP (number of tokens per expert received via inter-EP a2a when ag_along_first_dim=False) - - gg: g*EP (number of tokens per expert received via inter-EP a2a when ag_along_first_dim=True) Examples: x_aD [a, D] routed_in_etG_D [et*G, D] - x_eGGD: [e, GG, D] + x_eGD: [e, G, D] """ def __init__( @@ -207,13 +197,13 @@ class MoE(torch.nn.Module): routed_in_EG_D = routed_in_EG_D * router_scores.reshape(-1, 1) out_aD = self.shared_expert(x_aD) - routed_out_egg_D = self.experts(routed_in_EG_D.detach()) + routed_out_eg_D = self.experts(routed_in_EG_D.detach()) router_indices_EG_D = router_indices.reshape(-1, 1).expand(-1, D) out_aD.scatter_add_( dim=0, index=router_indices_EG_D, - src=routed_out_egg_D.view(-1, D), + src=routed_out_eg_D.view(-1, D), ) out_aD = reduce_from_model_parallel_region(out_aD) return out_aD.view(-1, slen, D) diff --git a/llama_stack/providers/inline/inference/meta_reference/llama4/preprocess.py b/llama_stack/models/llama/llama4/preprocess.py similarity index 100% rename from llama_stack/providers/inline/inference/meta_reference/llama4/preprocess.py rename to llama_stack/models/llama/llama4/preprocess.py diff --git a/llama_stack/models/llama/llama4/prompts.py b/llama_stack/models/llama/llama4/prompts.py index 97f573ef8..13b96359a 100644 --- a/llama_stack/models/llama/llama4/prompts.py +++ b/llama_stack/models/llama/llama4/prompts.py @@ -4,20 +4,13 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. - import textwrap from io import BytesIO from pathlib import Path from typing import List -from llama_stack.models.llama.datatypes import RawMediaItem, RawMessage, RawTextItem -from llama_stack.models.llama.prompt_format import ( +from ..datatypes import RawMediaItem, RawMessage, RawTextItem +from ..prompt_format import ( Llama4UseCase, TextCompletionContent, UseCase, diff --git a/llama_stack/models/llama/llama4/quantization/__init__.py b/llama_stack/models/llama/llama4/quantization/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/llama_stack/models/llama/llama4/quantization/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
diff --git a/llama_stack/providers/inline/inference/meta_reference/llama4/quantization/loader.py b/llama_stack/models/llama/llama4/quantization/loader.py similarity index 70% rename from llama_stack/providers/inline/inference/meta_reference/llama4/quantization/loader.py rename to llama_stack/models/llama/llama4/quantization/loader.py index 69aa309fa..b50432896 100644 --- a/llama_stack/providers/inline/inference/meta_reference/llama4/quantization/loader.py +++ b/llama_stack/models/llama/llama4/quantization/loader.py @@ -6,20 +6,29 @@ import logging import os -from typing import Optional +from typing import Callable, Optional import torch from fairscale.nn.model_parallel.initialize import get_model_parallel_rank -from torch import Tensor +from torch import Tensor, nn from torch.nn import functional as F -from ..generation import QuantizationMode +from ...datatypes import QuantizationMode from ..model import Transformer, TransformerBlock from ..moe import MoE log = logging.getLogger(__name__) +def swiglu_wrapper_no_reduce( + self, + x: Tensor, +): + from ...quantize_impls import ffn_swiglu + + return ffn_swiglu(x, self.w1.weight, self.w3.weight, self.w2.weight) + + def experts_batched_swiglu_wrapper( self, x: Tensor, # (e, g, D) @@ -51,24 +60,30 @@ def convert_to_quantized_model( rank = get_model_parallel_rank() + def should_quantize_block(block: nn.Module) -> bool: + if not isinstance(block, TransformerBlock): + return False + + is_moe = isinstance(block.feed_forward, MoE) + if quantization_mode == QuantizationMode.fp8_mixed: + # skip quantization on first and last layers + return is_moe and not (block.layer_id == 0 or block.layer_id == (model.n_layers - 1)) + + return is_moe + use_rich_progress = use_rich_progress and rank == 0 - progress, log_status, update_status = logging_callbacks(use_rich_progress, rank, model) + progress, log_status, update_status = logging_callbacks(use_rich_progress, rank, model, should_quantize_block) if quantization_mode == 
QuantizationMode.int4_mixed: int4_scales_path = os.path.join(checkpoint_dir, f"int4_scales_{rank}.pt") - int4_zero_points_path = os.path.join(checkpoint_dir, f"int4_zero_points_{rank}.pt") if os.path.isfile(int4_scales_path): log_status(f"Rank {rank}: Loading int4 scales") int4_scales = torch.load(int4_scales_path, weights_only=True) - int4_zero_points = torch.load(int4_zero_points_path, weights_only=True) def apply_quantization(key, weight): scale = int4_scales[key] - zero_point = int4_zero_points[key] return load_int4( weight, scale, - zero_point, - fp8_activation_scale_ub, output_device=torch.device("cuda"), ) @@ -77,6 +92,7 @@ def convert_to_quantized_model( def apply_quantization(_, weight): return quantize_int4(weight, fp8_activation_scale_ub, output_device=torch.device("cuda")) + else: fp8_scales_path = os.path.join(checkpoint_dir, f"fp8_scales_{rank}.pt") if os.path.isfile(fp8_scales_path): @@ -104,33 +120,38 @@ def convert_to_quantized_model( progress.start() for _, block in model.named_modules(): - if isinstance(block, TransformerBlock): - # Skip quantization on first and last layers - if block.layer_id == 0 or block.layer_id == (model.n_layers - 1): - continue + if not should_quantize_block(block): + continue - # Skip quantization on dense layers - if not isinstance(block.feed_forward, MoE): - continue + update_status(f"Rank {rank} - Layer {block.layer_id}") - update_status(f"Rank {rank} - Layer {block.layer_id}") + # Quantize only routed experts, not shared + prefix = f"layers.{block.layer_id}.feed_forward" + moe = block.feed_forward + moe.experts.batched_swiglu = experts_batched_swiglu_wrapper.__get__(moe.experts) - # Quantize only routed experts, not shared - prefix = f"layers.{block.layer_id}.feed_forward" - moe = block.feed_forward - moe.experts.batched_swiglu = experts_batched_swiglu_wrapper.__get__(moe.experts) + for key in ("w1", "w3", "w2"): + param = getattr(moe.experts, key) + update_status(f"Rank {rank} - Layer {block.layer_id} - MoE {key}") 
+ setattr( + moe.experts, + key, + apply_quantization( + f"{prefix}.experts.{key}", + param.transpose(1, 2).contiguous(), + ), + ) + if quantization_mode == QuantizationMode.int4_mixed: + # Quantize shared experts + moe.shared_expert.forward = swiglu_wrapper_no_reduce.__get__(moe.shared_expert) for key in ("w1", "w3", "w2"): - param = getattr(moe.experts, key) - update_status(f"Rank {rank} - Layer {block.layer_id} - MoE {key}") - setattr( - moe.experts, - key, - apply_quantization(f"{prefix}.experts.{key}", param.transpose(1, 2).contiguous()), - ) + param = getattr(moe.shared_expert, key) + update_status(f"Rank {rank} - Layer {block.layer_id} - MoE shared expert {key}") + param.weight = apply_quantization(f"{prefix}.shared_expert.{key}", param.weight) - processed_blocks += 1 - update_status(message=None, completed=processed_blocks) + processed_blocks += 1 + update_status(message=None, completed=processed_blocks) update_status(f"Rank {rank} - Moving parameters to CUDA") @@ -149,7 +170,12 @@ def convert_to_quantized_model( # fp8/int4 loading can be very slow so we add progress bars to make life slightly better -def logging_callbacks(use_rich_progress: bool, rank: int, model: Transformer): +def logging_callbacks( + use_rich_progress: bool, + rank: int, + model: Transformer, + should_quantize_block: Callable[[nn.Module], bool], +): console = None if use_rich_progress: from rich.console import Console @@ -162,15 +188,7 @@ def logging_callbacks(use_rich_progress: bool, rank: int, model: Transformer): elif rank == 0: # Only log from rank 0 for non-rich logging log.info(message) - total_blocks = sum( - 1 - for _, block in model.named_modules() - if ( - isinstance(block, TransformerBlock) - and not (block.layer_id == 0 or block.layer_id == (model.n_layers - 1)) - and isinstance(block.feed_forward, MoE) - ) - ) + total_blocks = sum(1 for _, block in model.named_modules() if should_quantize_block(block)) progress = None if use_rich_progress: from rich.progress import ( diff 
--git a/llama_stack/models/llama/llama4/tokenizer.py b/llama_stack/models/llama/llama4/tokenizer.py index c1347daca..4d271e5fd 100644 --- a/llama_stack/models/llama/llama4/tokenizer.py +++ b/llama_stack/models/llama/llama4/tokenizer.py @@ -4,9 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement. - import os from logging import getLogger from pathlib import Path @@ -59,8 +56,6 @@ LLAMA4_TEXT_POST_TRAIN_SPECIAL_TOKENS = [ "<|text_post_train_reserved_special_token_3|>", "<|text_post_train_reserved_special_token_4|>", "<|text_post_train_reserved_special_token_5|>", - "<|python_start|>", - "<|python_end|>", "<|finetune_right_pad|>", ] + get_reserved_special_tokens( "text_post_train", 61, 6 @@ -85,8 +80,23 @@ LLAMA4_VISION_SPECIAL_TOKENS = [ "vision", 1041, 7 ) # <|vision_reserved_special_token_7|>, ..., <|vision_reserved_special_token_1047|> +# 201134, ..., 201143 +LLAMA4_REASONING_SPECIAL_TOKENS = [ + "<|reasoning_reserved_special_token_0|>", + "<|reasoning_reserved_special_token_1|>", + "<|reasoning_reserved_special_token_2|>", + "<|reasoning_reserved_special_token_3|>", + "<|reasoning_reserved_special_token_4|>", + "<|reasoning_reserved_special_token_5|>", + "<|reasoning_reserved_special_token_6|>", + "<|reasoning_reserved_special_token_7|>", + "<|reasoning_thinking_start|>", + "<|reasoning_thinking_end|>", +] -LLAMA4_SPECIAL_TOKENS = LLAMA4_TEXT_POST_TRAIN_SPECIAL_TOKENS + LLAMA4_VISION_SPECIAL_TOKENS +LLAMA4_SPECIAL_TOKENS = ( + LLAMA4_TEXT_POST_TRAIN_SPECIAL_TOKENS + LLAMA4_VISION_SPECIAL_TOKENS + LLAMA4_REASONING_SPECIAL_TOKENS +) BASIC_SPECIAL_TOKENS = [ "<|begin_of_text|>", @@ -155,6 +165,9 @@ class Tokenizer: self.eot_id: int = self.special_tokens["<|eot|>"] self.eom_id: int = self.special_tokens["<|eom|>"] + 
self.thinking_start_id: int = self.special_tokens["<|reasoning_thinking_start|>"] + self.thinking_end_id: int = self.special_tokens["<|reasoning_thinking_end|>"] + self.stop_tokens = [ self.eos_id, self.special_tokens["<|eom|>"], diff --git a/llama_stack/providers/inline/inference/meta_reference/llama4/vision/embedding.py b/llama_stack/models/llama/llama4/vision/embedding.py similarity index 96% rename from llama_stack/providers/inline/inference/meta_reference/llama4/vision/embedding.py rename to llama_stack/models/llama/llama4/vision/embedding.py index 73b29cbef..ed7659a73 100644 --- a/llama_stack/providers/inline/inference/meta_reference/llama4/vision/embedding.py +++ b/llama_stack/models/llama/llama4/vision/embedding.py @@ -4,13 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. 
- import math from typing import Any, Callable, Dict, List diff --git a/llama_stack/providers/inline/inference/meta_reference/llama4/vision/encoder.py b/llama_stack/models/llama/llama4/vision/encoder.py similarity index 100% rename from llama_stack/providers/inline/inference/meta_reference/llama4/vision/encoder.py rename to llama_stack/models/llama/llama4/vision/encoder.py diff --git a/llama_stack/models/llama/prompt_format.py b/llama_stack/models/llama/prompt_format.py index 695c0bf74..edb34620c 100644 --- a/llama_stack/models/llama/prompt_format.py +++ b/llama_stack/models/llama/prompt_format.py @@ -28,9 +28,6 @@ from llama_stack.models.llama.datatypes import ( ToolPromptFormat, ) from llama_stack.models.llama.llama4.tokenizer import Tokenizer -from llama_stack.providers.inline.inference.meta_reference.llama4.datatypes import ( - LLMInput, -) from .llama3.interface import LLama31Interface from .llama3.template_data import ( @@ -76,21 +73,22 @@ class UseCase(BaseModel): text += dialog text += "\n\n" continue - - elif isinstance(dialog, TextCompletionContent): - input_tokens, output_tokens = generator.text_completion_raw( - dialog.content, - temperature=0.1, - top_p=0.95, - max_gen_len=64, - ) else: - input_tokens, output_tokens = generator.chat_completion_raw( - dialog, - temperature=0.0, - top_p=0.95, - max_gen_len=self.max_gen_len, + batch = [dialog] + method = ( + generator.completion if isinstance(dialog, TextCompletionContent) else generator.chat_completion ) + input_tokens = [] + output_tokens = [] + for token_results in method(batch, echo=True, temperature=0.1, top_p=0.95): + result = token_results[0] + if result.source == "input": + input_tokens.append(result.token) + else: + output_tokens.append(result.token) + + if result.finished: + break text += "##### Input Prompt Format\n" # FIXME: This is added to undo the hack in chat_formatter where @@ -126,27 +124,27 @@ class Llama4UseCase(UseCase): text = "" tokenizer = Tokenizer.get_instance() - temperature = 
0.0 for dialog in self.dialogs: if isinstance(dialog, str): text += dialog text += "\n\n" continue - - elif isinstance(dialog, TextCompletionContent): - # TODO pass the raw input and do the encoding in the text completion function - input_tokens = tokenizer.encode(dialog.content, bos=True, eos=False) - llm_input = LLMInput(tokens=input_tokens) - output_tokens, decoded_tokens, token_logprobs = generator.text_completion_raw( - llm_input, temperature=temperature, max_gen_len=self.max_gen_len - ) - else: - input_tokens, output_tokens = generator.chat_completion_raw( - dialog, - temperature=temperature, - max_gen_len=self.max_gen_len, + batch = [dialog] + method = ( + generator.completion if isinstance(dialog, TextCompletionContent) else generator.chat_completion ) + input_tokens = [] + output_tokens = [] + for token_results in method(batch, echo=True, temperature=0.0): + result = token_results[0] + if result.source == "input": + input_tokens.append(result.token) + else: + output_tokens.append(result.token) + + if result.finished: + break text += "##### Input Prompt Format\n" text += _code_block(tokenizer.decode(input_tokens)) diff --git a/llama_stack/providers/inline/inference/meta_reference/quantize_impls.py b/llama_stack/models/llama/quantize_impls.py similarity index 100% rename from llama_stack/providers/inline/inference/meta_reference/quantize_impls.py rename to llama_stack/models/llama/quantize_impls.py diff --git a/llama_stack/models/llama/sku_list.py b/llama_stack/models/llama/sku_list.py index dd3144bb0..513481831 100644 --- a/llama_stack/models/llama/sku_list.py +++ b/llama_stack/models/llama/sku_list.py @@ -4,24 +4,15 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. - from dataclasses import dataclass from functools import lru_cache from typing import List, Optional -from .datatypes import ( +from .sku_types import ( CheckpointQuantizationFormat, CoreModelId, Model, ModelFamily, - SamplingParams, - TopPSamplingStrategy, ) LLAMA2_VOCAB_SIZE = 32000 @@ -47,15 +38,6 @@ def all_registered_models() -> List[Model]: ) -def recommended_sampling_params() -> SamplingParams: - return SamplingParams( - strategy=TopPSamplingStrategy( - temperature=1.0, - top_p=0.9, - ) - ) - - def llama2_family() -> List[Model]: return [ *llama2_base_models(), @@ -150,7 +132,6 @@ def llama2_base_models() -> List[Model]: core_model_id=CoreModelId.llama2_7b, description="Llama 2 7b model", huggingface_repo="meta-llama/Llama-2-7b", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 4096, "n_layers": 32, @@ -169,7 +150,6 @@ def llama2_base_models() -> List[Model]: core_model_id=CoreModelId.llama2_13b, description="Llama 2 13b model", huggingface_repo="meta-llama/Llama-2-13b", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 5120, "n_layers": 40, @@ -188,7 +168,6 @@ def llama2_base_models() -> List[Model]: core_model_id=CoreModelId.llama2_70b, description="Llama 2 70b model", huggingface_repo="meta-llama/Llama-2-70b", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 8192, "n_layers": 80, @@ -230,7 +209,6 @@ def llama3_base_models() -> List[Model]: core_model_id=CoreModelId.llama3_70b, description="Llama 3 70b model", huggingface_repo="meta-llama/Llama-3-70B", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 8192, "n_layers": 80, @@ -254,7 +232,6 @@ def llama3_1_base_models() -> List[Model]: core_model_id=CoreModelId.llama3_1_8b, 
description="Llama 3.1 8b model", huggingface_repo="meta-llama/Llama-3.1-8B", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 4096, "n_layers": 32, @@ -273,7 +250,6 @@ def llama3_1_base_models() -> List[Model]: core_model_id=CoreModelId.llama3_1_70b, description="Llama 3.1 70b model", huggingface_repo="meta-llama/Llama-3.1-70B", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 8192, "n_layers": 80, @@ -293,7 +269,6 @@ def llama3_1_base_models() -> List[Model]: variant="bf16-mp8", description="Llama 3.1 405b model (BF16 weights)", huggingface_repo="meta-llama/Llama-3.1-405B", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 16384, "n_layers": 126, @@ -313,7 +288,6 @@ def llama3_1_base_models() -> List[Model]: description="Llama 3.1 405b model (FP8 quantized)", huggingface_repo="meta-llama/Llama-3.1-405B-FP8", quantization_format=CheckpointQuantizationFormat.fp8_mixed, - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 16384, "n_layers": 126, @@ -333,7 +307,6 @@ def llama3_1_base_models() -> List[Model]: variant="bf16-mp16", description="Llama 3.1 405b model (BF16 weights for mp16)", huggingface_repo="meta-llama/Llama-3.1-405B", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 16384, "n_layers": 126, @@ -357,7 +330,6 @@ def llama3_2_base_models() -> List[Model]: core_model_id=CoreModelId.llama3_2_1b, description="Llama 3.2 1b model", huggingface_repo="meta-llama/Llama-3.2-1B", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 2048, "n_layers": 16, @@ -376,7 +348,6 @@ def llama3_2_base_models() -> List[Model]: core_model_id=CoreModelId.llama3_2_3b, description="Llama 3.2 3b model", huggingface_repo="meta-llama/Llama-3.2-3B", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 3072, "n_layers": 28, @@ -395,7 +366,6 @@ def llama3_2_base_models() -> List[Model]: 
core_model_id=CoreModelId.llama3_2_11b_vision, description="Llama 3.2 11b vision model", huggingface_repo="meta-llama/Llama-3.2-11B-Vision", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 4096, "n_layers": 32, @@ -417,7 +387,6 @@ def llama3_2_base_models() -> List[Model]: core_model_id=CoreModelId.llama3_2_90b_vision, description="Llama 3.2 90b vision model", huggingface_repo="meta-llama/Llama-3.2-90B-Vision", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 8192, "n_layers": 80, @@ -444,7 +413,6 @@ def llama2_instruct_models() -> List[Model]: core_model_id=CoreModelId.llama2_7b_chat, description="Llama 2 7b chat model", huggingface_repo="meta-llama/Llama-2-7b-chat", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 4096, "n_layers": 32, @@ -463,7 +431,6 @@ def llama2_instruct_models() -> List[Model]: core_model_id=CoreModelId.llama2_13b_chat, description="Llama 2 13b chat model", huggingface_repo="meta-llama/Llama-2-13b-chat", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 5120, "n_layers": 40, @@ -482,7 +449,6 @@ def llama2_instruct_models() -> List[Model]: core_model_id=CoreModelId.llama2_70b_chat, description="Llama 2 70b chat model", huggingface_repo="meta-llama/Llama-2-70b-chat", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 8192, "n_layers": 80, @@ -506,7 +472,6 @@ def llama3_instruct_models() -> List[Model]: core_model_id=CoreModelId.llama3_8b_instruct, description="Llama 3 8b instruct model", huggingface_repo="meta-llama/Llama-3-8B-Instruct", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 4096, "n_layers": 32, @@ -525,7 +490,6 @@ def llama3_instruct_models() -> List[Model]: core_model_id=CoreModelId.llama3_70b_instruct, description="Llama 3 70b instruct model", huggingface_repo="meta-llama/Llama-3-70B-Instruct", - 
recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 8192, "n_layers": 80, @@ -549,7 +513,6 @@ def llama3_1_instruct_models() -> List[Model]: core_model_id=CoreModelId.llama3_1_8b_instruct, description="Llama 3.1 8b instruct model", huggingface_repo="meta-llama/Llama-3.1-8B-Instruct", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 4096, "n_layers": 32, @@ -568,7 +531,6 @@ def llama3_1_instruct_models() -> List[Model]: core_model_id=CoreModelId.llama3_1_70b_instruct, description="Llama 3.1 70b instruct model", huggingface_repo="meta-llama/Llama-3.1-70B-Instruct", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 8192, "n_layers": 80, @@ -588,7 +550,6 @@ def llama3_1_instruct_models() -> List[Model]: variant="bf16-mp8", description="Llama 3.1 405b instruct model (BF16 weights)", huggingface_repo="meta-llama/Llama-3.1-405B-Instruct", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 16384, "n_layers": 126, @@ -608,7 +569,6 @@ def llama3_1_instruct_models() -> List[Model]: description="Llama 3.1 405b instruct model (FP8 quantized)", huggingface_repo="meta-llama/Llama-3.1-405B-Instruct-FP8", quantization_format=CheckpointQuantizationFormat.fp8_mixed, - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 16384, "n_layers": 126, @@ -628,7 +588,6 @@ def llama3_1_instruct_models() -> List[Model]: variant="bf16-mp16", description="Llama 3.1 405b instruct model (BF16 weights for mp16)", huggingface_repo="meta-llama/Llama-3.1-405B-Instruct", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 16384, "n_layers": 126, @@ -684,7 +643,6 @@ def llama3_2_quantized_models() -> List[Model]: quantization_format=CheckpointQuantizationFormat.int4, description="Llama 3.2 1b INT4 quantized LoRA", huggingface_repo="meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8", - 
recommended_sampling_params=recommended_sampling_params(), arch_args={ **arch_args_1b(), "quantization_args": { @@ -703,7 +661,6 @@ def llama3_2_quantized_models() -> List[Model]: quantization_format=CheckpointQuantizationFormat.int4, description="Llama 3.2 1b INT4 quantized SpinQuant", huggingface_repo="meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8", - recommended_sampling_params=recommended_sampling_params(), arch_args={ **arch_args_1b(), "quantization_args": { @@ -718,7 +675,6 @@ def llama3_2_quantized_models() -> List[Model]: quantization_format=CheckpointQuantizationFormat.int4, description="Llama 3.2 3b INT4 quantized LoRA", huggingface_repo="meta-llama/Llama-3.2-3B-Instruct-QLORA_INT4_EO8", - recommended_sampling_params=recommended_sampling_params(), arch_args={ **arch_args_3b(), "quantization_args": { @@ -737,7 +693,6 @@ def llama3_2_quantized_models() -> List[Model]: quantization_format=CheckpointQuantizationFormat.int4, description="Llama 3.2 3b INT4 quantized SpinQuant", huggingface_repo="meta-llama/Llama-3.2-3B-Instruct-SpinQuant_INT4_EO8", - recommended_sampling_params=recommended_sampling_params(), arch_args={ **arch_args_3b(), "quantization_args": { @@ -755,7 +710,6 @@ def llama3_2_instruct_models() -> List[Model]: core_model_id=CoreModelId.llama3_2_1b_instruct, description="Llama 3.2 1b instruct model", huggingface_repo="meta-llama/Llama-3.2-1B-Instruct", - recommended_sampling_params=recommended_sampling_params(), arch_args=arch_args_1b(), pth_file_count=1, ), @@ -763,7 +717,6 @@ def llama3_2_instruct_models() -> List[Model]: core_model_id=CoreModelId.llama3_2_3b_instruct, description="Llama 3.2 3b instruct model", huggingface_repo="meta-llama/Llama-3.2-3B-Instruct", - recommended_sampling_params=recommended_sampling_params(), arch_args=arch_args_3b(), pth_file_count=1, ), @@ -772,7 +725,6 @@ def llama3_2_instruct_models() -> List[Model]: core_model_id=CoreModelId.llama3_2_11b_vision_instruct, description="Llama 3.2 11b vision instruct 
model", huggingface_repo="meta-llama/Llama-3.2-11B-Vision-Instruct", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 4096, "n_layers": 32, @@ -794,7 +746,6 @@ def llama3_2_instruct_models() -> List[Model]: core_model_id=CoreModelId.llama3_2_90b_vision_instruct, description="Llama 3.2 90b vision instruct model", huggingface_repo="meta-llama/Llama-3.2-90B-Vision-Instruct", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 8192, "n_layers": 80, @@ -821,7 +772,6 @@ def llama3_3_instruct_models() -> List[Model]: core_model_id=CoreModelId.llama3_3_70b_instruct, description="Llama 3.3 70b instruct", huggingface_repo="meta-llama/Llama-3.3-70B-Instruct", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 8192, "n_layers": 80, @@ -846,7 +796,6 @@ def safety_models() -> List[Model]: core_model_id=CoreModelId.llama_guard_3_11b_vision, description="Llama Guard v3 11b vision system safety model", huggingface_repo="meta-llama/Llama-Guard-3-11B-Vision", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 4096, "n_layers": 32, @@ -870,7 +819,6 @@ def safety_models() -> List[Model]: description="Llama Guard v3 1b 'int4' quantized system safety model", huggingface_repo="meta-llama/Llama-Guard-3-1B-INT4", quantization_format=CheckpointQuantizationFormat.int4, - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 2048, "n_layers": 12, @@ -888,7 +836,6 @@ def safety_models() -> List[Model]: core_model_id=CoreModelId.llama_guard_3_1b, description="Llama Guard v3 1b system safety model", huggingface_repo="meta-llama/Llama-Guard-3-1B", - recommended_sampling_params=recommended_sampling_params(), arch_args={ "dim": 2048, "n_layers": 16, diff --git a/llama_stack/models/llama/sku_types.py b/llama_stack/models/llama/sku_types.py new file mode 100644 index 000000000..88799b66d --- /dev/null +++ b/llama_stack/models/llama/sku_types.py @@ -0,0 +1,229 @@ 
+# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from enum import Enum +from typing import Any, Dict, Optional + +from pydantic import BaseModel, ConfigDict, Field + + +class CheckpointQuantizationFormat(Enum): + # default format + bf16 = "bf16" + + # used for enabling fp8_rowwise inference, some weights are bf16 + fp8_mixed = "fp8-mixed" + + int8 = "int8" + + int4 = "int4" + + +class ModelFamily(Enum): + llama2 = "llama2" + llama3 = "llama3" + llama3_1 = "llama3_1" + llama3_2 = "llama3_2" + llama3_3 = "llama3_3" + llama4 = "llama4" + safety = "safety" + + +class CoreModelId(Enum): + """Each of these models is a unique "SKU". These root models can be served in various garbs (especially by quantizing them)""" + + # Llama 2 family + llama2_7b = "Llama-2-7b" + llama2_13b = "Llama-2-13b" + llama2_70b = "Llama-2-70b" + llama2_7b_chat = "Llama-2-7b-chat" + llama2_13b_chat = "Llama-2-13b-chat" + llama2_70b_chat = "Llama-2-70b-chat" + + # Llama 3 family + llama3_8b = "Llama-3-8B" + llama3_70b = "Llama-3-70B" + llama3_8b_instruct = "Llama-3-8B-Instruct" + llama3_70b_instruct = "Llama-3-70B-Instruct" + + # Llama 3.1 family + llama3_1_8b = "Llama3.1-8B" + llama3_1_70b = "Llama3.1-70B" + llama3_1_405b = "Llama3.1-405B" + llama3_1_8b_instruct = "Llama3.1-8B-Instruct" + llama3_1_70b_instruct = "Llama3.1-70B-Instruct" + llama3_1_405b_instruct = "Llama3.1-405B-Instruct" + + # Llama 3.2 family + llama3_2_1b = "Llama3.2-1B" + llama3_2_3b = "Llama3.2-3B" + llama3_2_1b_instruct = "Llama3.2-1B-Instruct" + llama3_2_3b_instruct = "Llama3.2-3B-Instruct" + llama3_2_11b_vision = "Llama3.2-11B-Vision" + llama3_2_90b_vision = "Llama3.2-90B-Vision" + llama3_2_11b_vision_instruct = "Llama3.2-11B-Vision-Instruct" + llama3_2_90b_vision_instruct = "Llama3.2-90B-Vision-Instruct" + + # Llama 3.3 family + llama3_3_70b_instruct = 
"Llama3.3-70B-Instruct" + + # Llama 4 family + llama4_scout_17b_16e = "Llama-4-Scout-17B-16E" + llama4_scout_17b_16e_instruct = "Llama-4-Scout-17B-16E-Instruct" + llama4_maverick_17b_128e = "Llama-4-Maverick-17B-128E" + llama4_maverick_17b_128e_instruct = "Llama-4-Maverick-17B-128E-Instruct" + + # Safety models + llama_guard_3_8b = "Llama-Guard-3-8B" + llama_guard_2_8b = "Llama-Guard-2-8B" + llama_guard_3_11b_vision = "Llama-Guard-3-11B-Vision" + llama_guard_3_1b = "Llama-Guard-3-1B" + + +def is_multimodal(model_id) -> bool: + if model_id in [ + CoreModelId.llama3_2_11b_vision, + CoreModelId.llama3_2_90b_vision, + CoreModelId.llama3_2_11b_vision_instruct, + CoreModelId.llama3_2_90b_vision_instruct, + ]: + return True + else: + return False + + +def model_family(model_id) -> ModelFamily: + if model_id in [ + CoreModelId.llama2_7b, + CoreModelId.llama2_13b, + CoreModelId.llama2_70b, + CoreModelId.llama2_7b_chat, + CoreModelId.llama2_13b_chat, + CoreModelId.llama2_70b_chat, + ]: + return ModelFamily.llama2 + elif model_id in [ + CoreModelId.llama3_8b, + CoreModelId.llama3_70b, + CoreModelId.llama3_8b_instruct, + CoreModelId.llama3_70b_instruct, + ]: + return ModelFamily.llama3 + elif model_id in [ + CoreModelId.llama3_1_8b, + CoreModelId.llama3_1_70b, + CoreModelId.llama3_1_405b, + CoreModelId.llama3_1_8b_instruct, + CoreModelId.llama3_1_70b_instruct, + CoreModelId.llama3_1_405b_instruct, + ]: + return ModelFamily.llama3_1 + elif model_id in [ + CoreModelId.llama3_2_1b, + CoreModelId.llama3_2_3b, + CoreModelId.llama3_2_1b_instruct, + CoreModelId.llama3_2_3b_instruct, + CoreModelId.llama3_2_11b_vision, + CoreModelId.llama3_2_90b_vision, + CoreModelId.llama3_2_11b_vision_instruct, + CoreModelId.llama3_2_90b_vision_instruct, + ]: + return ModelFamily.llama3_2 + elif model_id in [ + CoreModelId.llama3_3_70b_instruct, + ]: + return ModelFamily.llama3_3 + elif model_id in [ + CoreModelId.llama4_scout_17b_16e, + CoreModelId.llama4_scout_17b_16e_instruct, + 
CoreModelId.llama4_maverick_17b_128e, + CoreModelId.llama4_maverick_17b_128e_instruct, + ]: + return ModelFamily.llama4 + elif model_id in [ + CoreModelId.llama_guard_3_8b, + CoreModelId.llama_guard_2_8b, + CoreModelId.llama_guard_3_11b_vision, + CoreModelId.llama_guard_3_1b, + ]: + return ModelFamily.safety + else: + raise ValueError(f"Unknown model family for {model_id}") + + +class Model(BaseModel): + core_model_id: CoreModelId + description: str + huggingface_repo: Optional[str] = None + arch_args: Dict[str, Any] + variant: str = "" + + quantization_format: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16 + pth_file_count: int + metadata: Dict[str, Any] = Field(default_factory=dict) + + # silence pydantic until we remove the `model_` fields + model_config = ConfigDict(protected_namespaces=()) + + @property + def model_family(self) -> ModelFamily: + return model_family(self.core_model_id) + + # The SKU is uniquely identified by (model_id, variant) combo + def descriptor(self, shorten_default_variant: bool = True) -> str: + if not self.variant: + return self.core_model_id.value + return f"{self.core_model_id.value}:{self.variant}" + + @property + def is_instruct_model(self) -> bool: + return "instruct" in self.core_model_id.value + + # Featured models are shown in the non-exhaustive model list + @property + def is_featured(self) -> bool: + return self.model_family in [ + ModelFamily.llama3_1, + ModelFamily.llama3_2, + ModelFamily.llama3_3, + ModelFamily.llama4, + ModelFamily.safety, + ] + + @property + def max_seq_length(self) -> int: + if self.model_family == ModelFamily.llama2: + return 4096 + elif self.core_model_id == CoreModelId.llama_guard_2_8b: + return 4096 + elif self.model_family == ModelFamily.llama3: + return 8192 + elif self.model_family in [ModelFamily.llama3_1, ModelFamily.llama3_3]: + return 131072 + elif self.model_family == ModelFamily.llama3_2: + if self.quantization_format == CheckpointQuantizationFormat.int4: + return 8192 + 
return 131072 + elif self.model_family == ModelFamily.llama4: + if self.core_model_id in { + CoreModelId.llama4_scout_17b_16e, + CoreModelId.llama4_maverick_17b_128e, + }: + return 262144 + if self.core_model_id == CoreModelId.llama4_scout_17b_16e_instruct: + return 10485760 + if self.core_model_id == CoreModelId.llama4_maverick_17b_128e_instruct: + return 1048576 + + raise AssertionError(f"Unexpected core model id: {self.core_model_id}") + elif self.core_model_id in [ + CoreModelId.llama_guard_3_8b, + CoreModelId.llama_guard_3_11b_vision, + CoreModelId.llama_guard_3_1b, + ]: + return 131072 + else: + raise ValueError(f"Unknown max_seq_len for {self.core_model_id}") diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index e1af4ab71..6840da89f 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -52,6 +52,7 @@ from llama_stack.apis.inference import ( StopReason, SystemMessage, ToolDefinition, + ToolParamDefinition, ToolResponse, ToolResponseMessage, UserMessage, @@ -63,7 +64,6 @@ from llama_stack.log import get_logger from llama_stack.models.llama.datatypes import ( BuiltinTool, ToolCall, - ToolParamDefinition, ) from llama_stack.providers.utils.kvstore import KVStore from llama_stack.providers.utils.telemetry import tracing diff --git a/llama_stack/providers/inline/inference/meta_reference/__init__.py b/llama_stack/providers/inline/inference/meta_reference/__init__.py index 3ef7cfd45..3710766e2 100644 --- a/llama_stack/providers/inline/inference/meta_reference/__init__.py +++ b/llama_stack/providers/inline/inference/meta_reference/__init__.py @@ -4,13 +4,13 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Any, Dict, Union +from typing import Any, Dict -from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig +from .config import MetaReferenceInferenceConfig async def get_provider_impl( - config: Union[MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig], + config: MetaReferenceInferenceConfig, _deps: Dict[str, Any], ): from .inference import MetaReferenceInferenceImpl diff --git a/llama_stack/providers/inline/inference/meta_reference/common.py b/llama_stack/providers/inline/inference/meta_reference/common.py index 3dc5e89f9..beb0d39d4 100644 --- a/llama_stack/providers/inline/inference/meta_reference/common.py +++ b/llama_stack/providers/inline/inference/meta_reference/common.py @@ -5,19 +5,10 @@ # the root directory of this source tree. from pathlib import Path -from typing import List, Optional - -from pydantic import BaseModel from llama_stack.distribution.utils.model_utils import model_local_dir -class TokenResult(BaseModel): - token: int - text: str - logprobs: Optional[List[float]] = None - - def model_checkpoint_dir(model_id) -> str: checkpoint_dir = Path(model_local_dir(model_id)) diff --git a/llama_stack/providers/inline/inference/meta_reference/config.py b/llama_stack/providers/inline/inference/meta_reference/config.py index 9e5f7747e..315667506 100644 --- a/llama_stack/providers/inline/inference/meta_reference/config.py +++ b/llama_stack/providers/inline/inference/meta_reference/config.py @@ -21,6 +21,7 @@ class MetaReferenceInferenceConfig(BaseModel): torch_seed: Optional[int] = None max_seq_len: int = 4096 max_batch_size: int = 1 + model_parallel_size: Optional[int] = None # when this is False, we assume that the distributed process group is setup by someone # outside of this code (e.g., when run inside `torchrun`). 
that is useful for clients @@ -31,6 +32,8 @@ class MetaReferenceInferenceConfig(BaseModel): # can override by specifying the directory explicitly checkpoint_dir: Optional[str] = None + quantization: Optional[QuantizationConfig] = None + @field_validator("model") @classmethod def validate_model(cls, model: str) -> str: @@ -47,27 +50,16 @@ class MetaReferenceInferenceConfig(BaseModel): cls, model: str = "Llama3.2-3B-Instruct", checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}", + quantization_type: str = "${env.QUANTIZATION_TYPE:bf16}", + model_parallel_size: str = "${env.MODEL_PARALLEL_SIZE:0}", **kwargs, ) -> Dict[str, Any]: return { "model": model, "max_seq_len": 4096, "checkpoint_dir": checkpoint_dir, + "quantization": { + "type": quantization_type, + }, + "model_parallel_size": model_parallel_size, } - - -class MetaReferenceQuantizedInferenceConfig(MetaReferenceInferenceConfig): - quantization: QuantizationConfig - - @classmethod - def sample_run_config( - cls, - model: str = "Llama3.2-3B-Instruct", - checkpoint_dir: str = "${env.CHECKPOINT_DIR:null}", - **kwargs, - ) -> Dict[str, Any]: - config = super().sample_run_config(model, checkpoint_dir, **kwargs) - config["quantization"] = { - "type": "fp8", - } - return config diff --git a/llama_stack/providers/inline/inference/meta_reference/generators.py b/llama_stack/providers/inline/inference/meta_reference/generators.py index 4b0ed7ecd..65bed4d8c 100644 --- a/llama_stack/providers/inline/inference/meta_reference/generators.py +++ b/llama_stack/providers/inline/inference/meta_reference/generators.py @@ -11,19 +11,18 @@ import torch from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerTokenizerData from llama_stack.apis.inference import ( - Fp8QuantizationConfig, - Int4QuantizationConfig, + GreedySamplingStrategy, JsonSchemaResponseFormat, ResponseFormat, -) -from llama_stack.models.llama.datatypes import ( - GreedySamplingStrategy, - Model, SamplingParams, TopPSamplingStrategy, ) +from 
llama_stack.models.llama.datatypes import QuantizationMode +from llama_stack.models.llama.llama3.generation import Llama3 from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer +from llama_stack.models.llama.llama4.generation import Llama4 from llama_stack.models.llama.llama4.tokenizer import Tokenizer as Llama4Tokenizer +from llama_stack.models.llama.sku_types import Model from llama_stack.providers.utils.inference.prompt_adapter import ( ChatCompletionRequestWithRawContent, CompletionRequestWithRawContent, @@ -31,10 +30,8 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( ) from .common import model_checkpoint_dir -from .config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig +from .config import MetaReferenceInferenceConfig from .inference import resolve_model -from .llama3.generation import Llama3 -from .llama4.generation import Llama4 Tokenizer = Llama4Tokenizer | Llama3Tokenizer @@ -116,10 +113,11 @@ def _infer_tool_prompt_format(request: ChatCompletionRequestWithRawContent): return get_default_tool_prompt_format(request.model) +# TODO: combine Llama3 and Llama4 generators since they are almost identical now class Llama4Generator: def __init__( self, - config: MetaReferenceInferenceConfig | MetaReferenceQuantizedInferenceConfig, + config: MetaReferenceInferenceConfig, model_id: str, llama_model: Model, ): @@ -134,11 +132,13 @@ class Llama4Generator: # if the model is a native llama model, get the default checkpoint_dir based on model core_model_id value ckpt_dir = model_checkpoint_dir(resolved_model.descriptor()) - if isinstance(config, MetaReferenceQuantizedInferenceConfig): - if isinstance(config.quantization, Fp8QuantizationConfig): - quantization_mode = "fp8_mixed" - elif isinstance(config.quantization, Int4QuantizationConfig): - quantization_mode = "int4_mixed" + if config.quantization: + if config.quantization.type == "fp8_mixed": + quantization_mode = QuantizationMode.fp8_mixed + 
elif config.quantization.type == "int4_mixed": + quantization_mode = QuantizationMode.int4_mixed + elif config.quantization.type == "bf16": + quantization_mode = None else: raise ValueError(f"Unsupported quantization mode {config.quantization}") else: @@ -148,7 +148,7 @@ class Llama4Generator: ckpt_dir=ckpt_dir, max_seq_len=config.max_seq_len, max_batch_size=config.max_batch_size, - world_size=llama_model.pth_file_count, + world_size=config.model_parallel_size or llama_model.pth_file_count, quantization_mode=quantization_mode, ) @@ -166,8 +166,8 @@ class Llama4Generator: max_gen_len = self.args.max_seq_len - 1 temperature, top_p = _infer_sampling_params(sampling_params) - yield from self.inner_generator.generate( - llm_input=self.formatter.encode_content(request.content), + for result in self.inner_generator.generate( + llm_inputs=[self.formatter.encode_content(request.content)], max_gen_len=max_gen_len, temperature=temperature, top_p=top_p, @@ -178,7 +178,8 @@ class Llama4Generator: self.args.vocab_size, request.response_format, ), - ) + ): + yield result[0] def chat_completion( self, @@ -190,8 +191,8 @@ class Llama4Generator: max_gen_len = self.args.max_seq_len - 1 temperature, top_p = _infer_sampling_params(sampling_params) - yield from self.inner_generator.generate( - llm_input=self.formatter.encode_dialog_prompt(request.messages, _infer_tool_prompt_format(request)), + for result in self.inner_generator.generate( + llm_inputs=[self.formatter.encode_dialog_prompt(request.messages, _infer_tool_prompt_format(request))], max_gen_len=max_gen_len, temperature=temperature, top_p=top_p, @@ -202,20 +203,46 @@ class Llama4Generator: self.args.vocab_size, request.response_format, ), - ) + ): + yield result[0] class Llama3Generator: def __init__( self, - config: MetaReferenceInferenceConfig | MetaReferenceQuantizedInferenceConfig, + config: MetaReferenceInferenceConfig, model_id: str, llama_model: Model, ): + if config.checkpoint_dir and config.checkpoint_dir != "null": + 
ckpt_dir = config.checkpoint_dir + else: + resolved_model = resolve_model(model_id) + if resolved_model is None: + # if the model is not a native llama model, get the default checkpoint_dir based on model id + ckpt_dir = model_checkpoint_dir(model_id) + else: + # if the model is a native llama model, get the default checkpoint_dir based on model core_model_id value + ckpt_dir = model_checkpoint_dir(resolved_model.descriptor()) + + if config.quantization: + if config.quantization.type == "fp8_mixed": + quantization_mode = QuantizationMode.fp8_mixed + elif config.quantization.type == "int4_mixed": + quantization_mode = QuantizationMode.int4_mixed + elif config.quantization.type == "bf16": + quantization_mode = None + else: + raise ValueError(f"Unsupported quantization mode {config.quantization}") + else: + quantization_mode = None + self.inner_generator = Llama3.build( - config=config, - model_id=model_id, - llama_model=llama_model, + ckpt_dir=ckpt_dir, + max_seq_len=config.max_seq_len, + max_batch_size=config.max_batch_size, + world_size=config.model_parallel_size or llama_model.pth_file_count, + quantization_mode=quantization_mode, ) self.tokenizer = self.inner_generator.tokenizer self.args = self.inner_generator.args @@ -231,8 +258,8 @@ class Llama3Generator: max_gen_len = self.args.max_seq_len - 1 temperature, top_p = _infer_sampling_params(sampling_params) - yield from self.inner_generator.generate( - model_input=self.formatter.encode_content(request.content), + for result in self.inner_generator.generate( + llm_inputs=[self.formatter.encode_content(request.content)], max_gen_len=max_gen_len, temperature=temperature, top_p=top_p, @@ -243,7 +270,8 @@ class Llama3Generator: self.args.vocab_size, request.response_format, ), - ) + ): + yield result[0] def chat_completion( self, @@ -255,8 +283,8 @@ class Llama3Generator: max_gen_len = self.args.max_seq_len - 1 temperature, top_p = _infer_sampling_params(sampling_params) - yield from self.inner_generator.generate( - 
model_input=self.formatter.encode_dialog_prompt(request.messages, _infer_tool_prompt_format(request)), + for result in self.inner_generator.generate( + llm_inputs=[self.formatter.encode_dialog_prompt(request.messages, _infer_tool_prompt_format(request))], max_gen_len=max_gen_len, temperature=temperature, top_p=top_p, @@ -267,4 +295,5 @@ class Llama3Generator: self.args.vocab_size, request.response_format, ), - ) + ): + yield result[0] diff --git a/llama_stack/providers/inline/inference/meta_reference/inference.py b/llama_stack/providers/inline/inference/meta_reference/inference.py index da217728b..5f81d6421 100644 --- a/llama_stack/providers/inline/inference/meta_reference/inference.py +++ b/llama_stack/providers/inline/inference/meta_reference/inference.py @@ -31,23 +31,21 @@ from llama_stack.apis.inference import ( LogProbConfig, Message, ResponseFormat, + SamplingParams, + StopReason, TokenLogProbs, ToolChoice, ToolConfig, -) -from llama_stack.apis.models import Model, ModelType -from llama_stack.models.llama.datatypes import ( - ModelFamily, - SamplingParams, - StopReason, ToolDefinition, ToolPromptFormat, ) +from llama_stack.apis.models import Model, ModelType from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat from llama_stack.models.llama.llama4.tokenizer import Tokenizer as Llama4Tokenizer from llama_stack.models.llama.sku_list import resolve_model +from llama_stack.models.llama.sku_types import ModelFamily from llama_stack.providers.datatypes import ModelsProtocolPrivate from llama_stack.providers.utils.inference.embedding_mixin import ( SentenceTransformerEmbeddingMixin, @@ -151,7 +149,7 @@ class MetaReferenceInferenceImpl( if self.config.create_distributed_process_group: self.generator = LlamaModelParallelGenerator( - 
model_parallel_size=llama_model.pth_file_count, + model_parallel_size=self.config.model_parallel_size or llama_model.pth_file_count, builder_fn=builder_fn, builder_params=builder_params, formatter=( diff --git a/llama_stack/providers/inline/inference/meta_reference/llama3/generation.py b/llama_stack/providers/inline/inference/meta_reference/llama3/generation.py deleted file mode 100644 index 3805e4310..000000000 --- a/llama_stack/providers/inline/inference/meta_reference/llama3/generation.py +++ /dev/null @@ -1,346 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - - -import json -import os -import sys -import time -from pathlib import Path -from typing import Callable, Generator, Optional, Union - -import torch -import torch.nn.functional as F -from fairscale.nn.model_parallel.initialize import ( - get_model_parallel_rank, - initialize_model_parallel, - model_parallel_is_initialized, -) - -from llama_stack.apis.inference import ( - Fp8QuantizationConfig, - Int4QuantizationConfig, -) -from llama_stack.log import get_logger -from llama_stack.models.llama.datatypes import Model -from llama_stack.models.llama.llama3.chat_format import ChatFormat, LLMInput -from llama_stack.models.llama.llama3.tokenizer import Tokenizer -from llama_stack.models.llama.sku_list import resolve_model - -from ..common import TokenResult, model_checkpoint_dir -from ..config import MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig -from .args import ModelArgs -from .model import Transformer -from .multimodal.model import CrossAttentionTransformer - -log = get_logger(__name__, category="inference") - - -class Llama3: - @staticmethod - def build( - config: Union[MetaReferenceInferenceConfig, MetaReferenceQuantizedInferenceConfig], - model_id: str, - llama_model: Model, - ): - """ - Build a Llama instance by 
initializing and loading a model checkpoint. - - Note: - This method initializes the distributed process group, sets the device to CUDA, - and loads the pre-trained model and tokenizer. - """ - if "DEVICE" in os.environ: - device = os.environ.get("DEVICE") - if device == "cuda": - assert torch.cuda.is_available(), "PyTorch CUDA backend not available" - if device == "xpu": - assert torch.xpu.is_available(), "PyTorch XPU backend not available" - else: - if torch.cuda.is_available(): - device = "cuda" - elif torch.xpu.is_available(): - device = "xpu" - else: - device = "cpu" - log.info(f"Using {device} device") - - llama_model_id = llama_model.core_model_id.value - if not torch.distributed.is_initialized(): - if device == "cuda": - torch.distributed.init_process_group("nccl") - else: - torch.distributed.init_process_group("gloo") - - model_parallel_size = llama_model.pth_file_count - - if not model_parallel_is_initialized(): - initialize_model_parallel(model_parallel_size) - - local_rank = int(os.environ.get("LOCAL_RANK", 0)) - if device == "cuda": - torch.cuda.set_device(local_rank) - elif device == "xpu": - torch.xpu.set_device(local_rank) - - # seed must be the same in all processes - if config.torch_seed is not None: - torch.manual_seed(config.torch_seed) - - if local_rank > 0: - sys.stdout = open(os.devnull, "w") - - start_time = time.time() - if config.checkpoint_dir and config.checkpoint_dir != "null": - ckpt_dir = config.checkpoint_dir - else: - resolved_model = resolve_model(model_id) - if resolved_model is None: - # if the model is not a native llama model, get the default checkpoint_dir based on model id - ckpt_dir = model_checkpoint_dir(model_id) - else: - # if the model is a native llama model, get the default checkpoint_dir based on model core_model_id value - ckpt_dir = model_checkpoint_dir(resolved_model.descriptor()) - - checkpoints = sorted(Path(ckpt_dir).glob("*.pth")) - assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}" - 
assert model_parallel_size == len(checkpoints), ( - f"Loading a checkpoint for MP={len(checkpoints)} but world size is {model_parallel_size}" - ) - ckpt_path = checkpoints[get_model_parallel_rank()] - state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True) - with open(Path(ckpt_dir) / "params.json", "r") as f: - params = json.loads(f.read()) - - if "model" in params: - params = params["model"] - - model_args: ModelArgs = ModelArgs( - max_seq_len=config.max_seq_len, - max_batch_size=config.max_batch_size, - **params, - ) - - tokenizer = Tokenizer.get_instance() - assert model_args.vocab_size == tokenizer.n_words, ( - f"model_args vocab = {model_args.vocab_size} but tokenizer vocab = {tokenizer.n_words}" - ) - - if isinstance(config, MetaReferenceQuantizedInferenceConfig): - if isinstance(config.quantization, Fp8QuantizationConfig): - from .quantization.loader import convert_to_fp8_quantized_model - - # load on CPU in bf16 so that fp8 conversion does not find an - # unexpected (fp32, e.g.) datatype - torch.set_default_tensor_type(torch.BFloat16Tensor) - if model_args.vision_chunk_size > 0: - model = CrossAttentionTransformer(model_args) - model.setup_cache(model_args.max_batch_size, torch.bfloat16) - else: - model = Transformer(model_args) - model.load_state_dict(state_dict, strict=False) - model = convert_to_fp8_quantized_model(model, config, ckpt_dir) - elif isinstance(config.quantization, Int4QuantizationConfig): - from .quantization.loader import convert_to_int4_quantized_model - - model = Transformer(model_args) - model = convert_to_int4_quantized_model(model, model_args, config) - model.load_state_dict(state_dict, strict=True) - - if model_args.quantization_args is not None and model_args.quantization_args.spinquant: - # Add a wrapper for adding hadamard transform for spinquant. - # This needs to be done after loading the state dict otherwise an error will be raised while - # loading the state dict. 
- from ..hadamard_utils import ( - add_hadamard_transform_for_spinquant, - ) - - add_hadamard_transform_for_spinquant(model) - else: - raise NotImplementedError("Currently int4 and fp8 are the only supported quantization methods.") - else: - if device == "cuda": - if torch.cuda.is_bf16_supported(): - torch.set_default_tensor_type(torch.cuda.BFloat16Tensor) - else: - torch.set_default_tensor_type(torch.cuda.HalfTensor) - else: - torch.set_default_device(device) - if device == "xpu" and torch.xpu.is_bf16_supported(): - torch.set_default_dtype(torch.bfloat16) - else: - torch.set_default_dtype(torch.half) - if model_args.vision_chunk_size > 0: - model = CrossAttentionTransformer(model_args) - model.setup_cache(model_args.max_batch_size, torch.bfloat16) - else: - model = Transformer(model_args) - model.load_state_dict(state_dict, strict=False) - - model.to(device) - - log.info(f"Loaded in {time.time() - start_time:.2f} seconds") - return Llama3(model, tokenizer, model_args, llama_model_id) - - def __init__( - self, - model: Transformer, - tokenizer: Tokenizer, - args: ModelArgs, - llama_model: str, - ): - self.args = args - self.model = model - self.tokenizer = tokenizer - self.formatter = ChatFormat(tokenizer) - self.llama_model = llama_model - - @torch.inference_mode() - def generate( - self, - model_input: LLMInput, - max_gen_len: int, - temperature: float = 0.6, - top_p: float = 0.9, - logprobs: bool = False, - echo: bool = False, - print_input_tokens: bool = False, - logits_processor: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, - ) -> Generator: - params = self.model.params - - if print_input_tokens: - input_tokens = [self.formatter.vision_token if t == 128256 else t for t in model_input.tokens] - log.info("Input to model -> " + self.tokenizer.decode(input_tokens)) - prompt_tokens = [model_input.tokens] - - bsz = 1 - assert bsz <= params.max_batch_size, (bsz, params.max_batch_size) - - min_prompt_len = min(len(t) for t in prompt_tokens) - 
max_prompt_len = max(len(t) for t in prompt_tokens) - - if max_prompt_len >= params.max_seq_len: - log.error(f"Out of token budget {max_prompt_len} vs {params.max_seq_len}") - return - - total_len = min(max_gen_len + max_prompt_len, params.max_seq_len) - - is_vision = isinstance(self.model, CrossAttentionTransformer) - if is_vision: - images = model_input.vision.images if model_input.vision is not None else [] - mask = model_input.vision.mask if model_input.vision is not None else [] - - # the method works for bsz > 1 so add a batch dimension - xattn_caches, cross_attention_masks, full_text_row_masked_out_mask = self.model.compute_vision_tokens_masks( - batch_images=[images], - batch_masks=[mask], - total_len=total_len, - ) - - pad_id = self.tokenizer.pad_id - tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long) - for k, t in enumerate(prompt_tokens): - tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long) - if logprobs: - token_logprobs = torch.zeros_like(tokens) - - prev_pos = 0 - eos_reached = torch.tensor([False] * bsz) - input_text_mask = tokens != pad_id - if min_prompt_len == total_len: - # TODO(ashwin): unify this branch with the one below and figure out multimodal crap - logits = self.model.forward(tokens, prev_pos) - token_logprobs = -F.cross_entropy( - input=logits.transpose(1, 2), - target=tokens, - reduction="none", - ignore_index=pad_id, - ) - - stop_tokens = torch.tensor(self.tokenizer.stop_tokens) - for cur_pos in range(min_prompt_len, total_len): - if is_vision: - position_ids = torch.arange(prev_pos, cur_pos, dtype=torch.long) - logits = self.model.forward( - position_ids, - tokens, - cross_attention_masks, - full_text_row_masked_out_mask, - xattn_caches, - ) - else: - logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos) - - if logits_processor is not None: - logits = logits_processor(tokens[:, :cur_pos], logits) - - if temperature > 0: - probs = torch.softmax(logits[:, -1] / temperature, dim=-1) - next_token = 
sample_top_p(probs, top_p) - else: - next_token = torch.argmax(logits[:, -1], dim=-1) - - next_token = next_token.reshape(-1) - # only replace token if prompt has already been generated - next_token = torch.where(input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token) - tokens[:, cur_pos] = next_token - - target = tokens[:, prev_pos + 1 : cur_pos + 1] - if is_vision: - # the logits space (num_classes) is designed to never contain a media_token - # however our input token stream does contain them. we need to nuke them here - # or else the CUDA kernels will crash with an illegal memory access - vision_tokens = [self.tokenizer.special_tokens["<|image|>"], 128256] - masks = [target.eq(t) for t in vision_tokens] - if len(masks) > 1: - mask = torch.logical_or(*masks) - else: - mask = masks[0] - target[mask] = 0 - - if logprobs: - token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy( - input=logits.transpose(1, 2), - target=tokens[:, prev_pos + 1 : cur_pos + 1], - reduction="none", - ignore_index=pad_id, - ) - eos_reached |= (~input_text_mask[:, cur_pos]) & (torch.isin(next_token, stop_tokens)) - yield TokenResult( - token=next_token[0].item(), - text=self.tokenizer.decode(next_token.tolist()), - logprobs=(token_logprobs[:, cur_pos : cur_pos + 1][0].tolist() if logprobs else None), - ) - - prev_pos = cur_pos - if all(eos_reached): - break - - -def sample_top_p(probs, p): - """ - Perform top-p (nucleus) sampling on a probability distribution. - - Args: - probs (torch.Tensor): Probability distribution tensor. - p (float): Probability threshold for top-p sampling. - - Returns: - torch.Tensor: Sampled token indices. - - Note: - Top-p sampling selects the smallest set of tokens whose cumulative probability mass - exceeds the threshold p. The distribution is renormalized based on the selected tokens. 
- """ - probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) - probs_sum = torch.cumsum(probs_sort, dim=-1) - mask = probs_sum - probs_sort > p - probs_sort[mask] = 0.0 - probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) - next_token = torch.multinomial(probs_sort, num_samples=1) - next_token = torch.gather(probs_idx, -1, next_token) - return next_token diff --git a/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py b/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py index e8767c2ff..74fc49d5e 100644 --- a/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +++ b/llama_stack/providers/inline/inference/meta_reference/parallel_utils.py @@ -32,13 +32,12 @@ from pydantic import BaseModel, Field from torch.distributed.launcher.api import LaunchConfig, elastic_launch from typing_extensions import Annotated +from llama_stack.models.llama.datatypes import GenerationResult from llama_stack.providers.utils.inference.prompt_adapter import ( ChatCompletionRequestWithRawContent, CompletionRequestWithRawContent, ) -from .common import TokenResult - log = logging.getLogger(__name__) @@ -75,7 +74,7 @@ class TaskRequest(BaseModel): class TaskResponse(BaseModel): type: Literal[ProcessingMessageName.task_response] = ProcessingMessageName.task_response - result: TokenResult + result: GenerationResult class ExceptionResponse(BaseModel): diff --git a/llama_stack/providers/inline/inference/vllm/openai_utils.py b/llama_stack/providers/inline/inference/vllm/openai_utils.py index 90b5398f9..d34f5ad5f 100644 --- a/llama_stack/providers/inline/inference/vllm/openai_utils.py +++ b/llama_stack/providers/inline/inference/vllm/openai_utils.py @@ -14,9 +14,10 @@ from llama_stack.apis.inference import ( JsonSchemaResponseFormat, Message, ToolChoice, + ToolDefinition, UserMessage, ) -from llama_stack.models.llama.datatypes import BuiltinTool, ToolDefinition +from llama_stack.models.llama.datatypes import BuiltinTool 
from llama_stack.providers.utils.inference.openai_compat import ( convert_message_to_openai_dict, get_sampling_options, diff --git a/llama_stack/providers/inline/inference/vllm/vllm.py b/llama_stack/providers/inline/inference/vllm/vllm.py index 256e0f821..ea2643b7a 100644 --- a/llama_stack/providers/inline/inference/vllm/vllm.py +++ b/llama_stack/providers/inline/inference/vllm/vllm.py @@ -46,6 +46,8 @@ from llama_stack.apis.inference import ( TokenLogProbs, ToolChoice, ToolConfig, + TopKSamplingStrategy, + TopPSamplingStrategy, ) from llama_stack.apis.models import Model from llama_stack.log import get_logger @@ -55,8 +57,6 @@ from llama_stack.models.llama.datatypes import ( ToolCall, ToolDefinition, ToolPromptFormat, - TopKSamplingStrategy, - TopPSamplingStrategy, ) from llama_stack.models.llama.llama3.chat_format import ChatFormat from llama_stack.models.llama.llama3.tokenizer import Tokenizer diff --git a/llama_stack/providers/inline/post_training/torchtune/common/utils.py b/llama_stack/providers/inline/post_training/torchtune/common/utils.py index f8a1c0436..a040ca1b0 100644 --- a/llama_stack/providers/inline/post_training/torchtune/common/utils.py +++ b/llama_stack/providers/inline/post_training/torchtune/common/utils.py @@ -22,8 +22,8 @@ from torchtune.models.llama3_2 import lora_llama3_2_3b from torchtune.modules.transforms import Transform from llama_stack.apis.post_training import DatasetFormat -from llama_stack.models.llama.datatypes import Model from llama_stack.models.llama.sku_list import resolve_model +from llama_stack.models.llama.sku_types import Model BuildLoraModelCallable = Callable[..., torch.nn.Module] BuildTokenizerCallable = Callable[..., Llama3Tokenizer] diff --git a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py index e514e3781..d95c40976 100644 --- a/llama_stack/providers/inline/safety/llama_guard/llama_guard.py +++ 
b/llama_stack/providers/inline/safety/llama_guard/llama_guard.py @@ -23,7 +23,8 @@ from llama_stack.apis.safety import ( ) from llama_stack.apis.shields import Shield from llama_stack.distribution.datatypes import Api -from llama_stack.models.llama.datatypes import CoreModelId, Role +from llama_stack.models.llama.datatypes import Role +from llama_stack.models.llama.sku_types import CoreModelId from llama_stack.providers.datatypes import ShieldsProtocolPrivate from llama_stack.providers.utils.inference.prompt_adapter import ( interleaved_content_as_str, diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 51ea4cbef..5f9ae421f 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -24,6 +24,8 @@ META_REFERENCE_DEPS = [ "zmq", "lm-format-enforcer", "sentence-transformers", + "torchao==0.5.0", + "fbgemm-gpu-genai==1.1.2", ] @@ -36,13 +38,6 @@ def available_providers() -> List[ProviderSpec]: module="llama_stack.providers.inline.inference.meta_reference", config_class="llama_stack.providers.inline.inference.meta_reference.MetaReferenceInferenceConfig", ), - InlineProviderSpec( - api=Api.inference, - provider_type="inline::meta-reference-quantized", - pip_packages=META_REFERENCE_DEPS + ["fbgemm-gpu", "torchao==0.5.0"], - module="llama_stack.providers.inline.inference.meta_reference", - config_class="llama_stack.providers.inline.inference.meta_reference.MetaReferenceQuantizedInferenceConfig", - ), InlineProviderSpec( api=Api.inference, provider_type="inline::vllm", diff --git a/llama_stack/providers/remote/inference/bedrock/models.py b/llama_stack/providers/remote/inference/bedrock/models.py index c5079799f..ec8120049 100644 --- a/llama_stack/providers/remote/inference/bedrock/models.py +++ b/llama_stack/providers/remote/inference/bedrock/models.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory 
of this source tree. -from llama_stack.models.llama.datatypes import CoreModelId +from llama_stack.models.llama.sku_types import CoreModelId from llama_stack.providers.utils.inference.model_registry import ( build_hf_repo_model_entry, ) diff --git a/llama_stack/providers/remote/inference/cerebras/cerebras.py b/llama_stack/providers/remote/inference/cerebras/cerebras.py index a53e6e5a5..43d986b86 100644 --- a/llama_stack/providers/remote/inference/cerebras/cerebras.py +++ b/llama_stack/providers/remote/inference/cerebras/cerebras.py @@ -28,8 +28,8 @@ from llama_stack.apis.inference import ( ToolConfig, ToolDefinition, ToolPromptFormat, + TopKSamplingStrategy, ) -from llama_stack.models.llama.datatypes import TopKSamplingStrategy from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, ) diff --git a/llama_stack/providers/remote/inference/cerebras/models.py b/llama_stack/providers/remote/inference/cerebras/models.py index 37419bf4c..38301b32a 100644 --- a/llama_stack/providers/remote/inference/cerebras/models.py +++ b/llama_stack/providers/remote/inference/cerebras/models.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from llama_stack.models.llama.datatypes import CoreModelId +from llama_stack.models.llama.sku_types import CoreModelId from llama_stack.providers.utils.inference.model_registry import ( build_hf_repo_model_entry, ) diff --git a/llama_stack/providers/remote/inference/databricks/databricks.py b/llama_stack/providers/remote/inference/databricks/databricks.py index 53a9c04f4..0eaf0135b 100644 --- a/llama_stack/providers/remote/inference/databricks/databricks.py +++ b/llama_stack/providers/remote/inference/databricks/databricks.py @@ -28,7 +28,7 @@ from llama_stack.apis.inference import ( ToolDefinition, ToolPromptFormat, ) -from llama_stack.models.llama.datatypes import CoreModelId +from llama_stack.models.llama.sku_types import CoreModelId from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, build_hf_repo_model_entry, diff --git a/llama_stack/providers/remote/inference/fireworks/models.py b/llama_stack/providers/remote/inference/fireworks/models.py index a0dc11768..4975d061f 100644 --- a/llama_stack/providers/remote/inference/fireworks/models.py +++ b/llama_stack/providers/remote/inference/fireworks/models.py @@ -5,7 +5,7 @@ # the root directory of this source tree. from llama_stack.apis.models.models import ModelType -from llama_stack.models.llama.datatypes import CoreModelId +from llama_stack.models.llama.sku_types import CoreModelId from llama_stack.providers.utils.inference.model_registry import ( ProviderModelEntry, build_hf_repo_model_entry, diff --git a/llama_stack/providers/remote/inference/nvidia/models.py b/llama_stack/providers/remote/inference/nvidia/models.py index 879855003..964125148 100644 --- a/llama_stack/providers/remote/inference/nvidia/models.py +++ b/llama_stack/providers/remote/inference/nvidia/models.py @@ -5,7 +5,7 @@ # the root directory of this source tree. 
from llama_stack.apis.models import ModelType -from llama_stack.models.llama.datatypes import CoreModelId +from llama_stack.models.llama.sku_types import CoreModelId from llama_stack.providers.utils.inference.model_registry import ( ProviderModelEntry, build_hf_repo_model_entry, diff --git a/llama_stack/providers/remote/inference/nvidia/nvidia.py b/llama_stack/providers/remote/inference/nvidia/nvidia.py index 5caf19fda..e1f5d7a6a 100644 --- a/llama_stack/providers/remote/inference/nvidia/nvidia.py +++ b/llama_stack/providers/remote/inference/nvidia/nvidia.py @@ -29,15 +29,13 @@ from llama_stack.apis.inference import ( LogProbConfig, Message, ResponseFormat, + SamplingParams, TextTruncation, ToolChoice, ToolConfig, -) -from llama_stack.models.llama.datatypes import ( - SamplingParams, ToolDefinition, - ToolPromptFormat, ) +from llama_stack.models.llama.datatypes import ToolPromptFormat from llama_stack.providers.utils.inference.model_registry import ( ModelRegistryHelper, ) diff --git a/llama_stack/providers/remote/inference/nvidia/openai_utils.py b/llama_stack/providers/remote/inference/nvidia/openai_utils.py index 0582cb816..3f2769b26 100644 --- a/llama_stack/providers/remote/inference/nvidia/openai_utils.py +++ b/llama_stack/providers/remote/inference/nvidia/openai_utils.py @@ -19,11 +19,9 @@ from llama_stack.apis.inference import ( CompletionRequest, CompletionResponse, CompletionResponseStreamChunk, + GreedySamplingStrategy, JsonSchemaResponseFormat, TokenLogProbs, -) -from llama_stack.models.llama.datatypes import ( - GreedySamplingStrategy, TopKSamplingStrategy, TopPSamplingStrategy, ) diff --git a/llama_stack/providers/remote/inference/ollama/models.py b/llama_stack/providers/remote/inference/ollama/models.py index be556762c..42e364105 100644 --- a/llama_stack/providers/remote/inference/ollama/models.py +++ b/llama_stack/providers/remote/inference/ollama/models.py @@ -5,7 +5,7 @@ # the root directory of this source tree. 
from llama_stack.apis.models.models import ModelType -from llama_stack.models.llama.datatypes import CoreModelId +from llama_stack.models.llama.sku_types import CoreModelId from llama_stack.providers.utils.inference.model_registry import ( ProviderModelEntry, build_hf_repo_model_entry, diff --git a/llama_stack/providers/remote/inference/sambanova/models.py b/llama_stack/providers/remote/inference/sambanova/models.py index 2231be22d..9589ea268 100644 --- a/llama_stack/providers/remote/inference/sambanova/models.py +++ b/llama_stack/providers/remote/inference/sambanova/models.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from llama_stack.models.llama.datatypes import CoreModelId +from llama_stack.models.llama.sku_types import CoreModelId from llama_stack.providers.utils.inference.model_registry import ( build_hf_repo_model_entry, ) diff --git a/llama_stack/providers/remote/inference/sambanova/sambanova.py b/llama_stack/providers/remote/inference/sambanova/sambanova.py index 635a42d38..a3badd468 100644 --- a/llama_stack/providers/remote/inference/sambanova/sambanova.py +++ b/llama_stack/providers/remote/inference/sambanova/sambanova.py @@ -21,6 +21,7 @@ from llama_stack.apis.inference import ( CompletionMessage, EmbeddingsResponse, EmbeddingTaskType, + GreedySamplingStrategy, Inference, LogProbConfig, Message, @@ -35,12 +36,9 @@ from llama_stack.apis.inference import ( ToolDefinition, ToolPromptFormat, ToolResponseMessage, - UserMessage, -) -from llama_stack.models.llama.datatypes import ( - GreedySamplingStrategy, TopKSamplingStrategy, TopPSamplingStrategy, + UserMessage, ) from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper from llama_stack.providers.utils.inference.openai_compat import ( diff --git a/llama_stack/providers/remote/inference/together/models.py b/llama_stack/providers/remote/inference/together/models.py index 
63d3d94b5..f014c03f0 100644 --- a/llama_stack/providers/remote/inference/together/models.py +++ b/llama_stack/providers/remote/inference/together/models.py @@ -5,7 +5,7 @@ # the root directory of this source tree. from llama_stack.apis.models.models import ModelType -from llama_stack.models.llama.datatypes import CoreModelId +from llama_stack.models.llama.sku_types import CoreModelId from llama_stack.providers.utils.inference.model_registry import ( ProviderModelEntry, build_hf_repo_model_entry, diff --git a/llama_stack/providers/remote/post_training/nvidia/models.py b/llama_stack/providers/remote/post_training/nvidia/models.py index 04a9af38c..7c696ac20 100644 --- a/llama_stack/providers/remote/post_training/nvidia/models.py +++ b/llama_stack/providers/remote/post_training/nvidia/models.py @@ -6,7 +6,7 @@ from typing import List -from llama_stack.models.llama.datatypes import CoreModelId +from llama_stack.models.llama.sku_types import CoreModelId from llama_stack.providers.utils.inference.model_registry import ( ProviderModelEntry, build_hf_repo_model_entry, diff --git a/llama_stack/providers/tests/report.py b/llama_stack/providers/tests/report.py index c9a7f69a8..bc29534be 100644 --- a/llama_stack/providers/tests/report.py +++ b/llama_stack/providers/tests/report.py @@ -12,8 +12,8 @@ import pytest from pytest import ExitCode from pytest_html.basereport import _process_outcome -from llama_stack.models.llama.datatypes import CoreModelId from llama_stack.models.llama.sku_list import all_registered_models +from llama_stack.models.llama.sku_types import CoreModelId INFERENCE_APIS = ["chat_completion"] FUNCTIONALITIES = ["streaming", "structured_output", "tool_calling"] diff --git a/llama_stack/providers/utils/inference/__init__.py b/llama_stack/providers/utils/inference/__init__.py index a885da235..e36be9404 100644 --- a/llama_stack/providers/utils/inference/__init__.py +++ b/llama_stack/providers/utils/inference/__init__.py @@ -6,8 +6,8 @@ from typing import List 
-from llama_stack.models.llama.datatypes import * # noqa: F403 from llama_stack.models.llama.sku_list import all_registered_models +from llama_stack.models.llama.sku_types import * # noqa: F403 def is_supported_safety_model(model: Model) -> bool: diff --git a/llama_stack/providers/utils/inference/openai_compat.py b/llama_stack/providers/utils/inference/openai_compat.py index e475d77b6..44a89dfb0 100644 --- a/llama_stack/providers/utils/inference/openai_compat.py +++ b/llama_stack/providers/utils/inference/openai_compat.py @@ -73,21 +73,21 @@ from llama_stack.apis.inference import ( CompletionMessage, CompletionResponse, CompletionResponseStreamChunk, + GreedySamplingStrategy, Message, + SamplingParams, SystemMessage, TokenLogProbs, ToolResponseMessage, + TopKSamplingStrategy, + TopPSamplingStrategy, UserMessage, ) from llama_stack.models.llama.datatypes import ( BuiltinTool, - GreedySamplingStrategy, - SamplingParams, StopReason, ToolCall, ToolDefinition, - TopKSamplingStrategy, - TopPSamplingStrategy, ) from llama_stack.providers.utils.inference.prompt_adapter import ( convert_image_content_to_url, diff --git a/llama_stack/providers/utils/inference/prompt_adapter.py b/llama_stack/providers/utils/inference/prompt_adapter.py index 0231312cc..4f9c4927a 100644 --- a/llama_stack/providers/utils/inference/prompt_adapter.py +++ b/llama_stack/providers/utils/inference/prompt_adapter.py @@ -34,7 +34,6 @@ from llama_stack.apis.inference import ( ) from llama_stack.log import get_logger from llama_stack.models.llama.datatypes import ( - ModelFamily, RawContent, RawContentItem, RawMediaItem, @@ -43,7 +42,6 @@ from llama_stack.models.llama.datatypes import ( Role, StopReason, ToolPromptFormat, - is_multimodal, ) from llama_stack.models.llama.llama3.chat_format import ChatFormat from llama_stack.models.llama.llama3.prompt_templates import ( @@ -55,6 +53,7 @@ from llama_stack.models.llama.llama3.prompt_templates import ( ) from llama_stack.models.llama.llama3.tokenizer import 
Tokenizer from llama_stack.models.llama.sku_list import resolve_model +from llama_stack.models.llama.sku_types import ModelFamily, is_multimodal from llama_stack.providers.utils.inference import supported_inference_models log = get_logger(name=__name__, category="inference") diff --git a/llama_stack/templates/dependencies.json b/llama_stack/templates/dependencies.json index 931240d37..b8f475cea 100644 --- a/llama_stack/templates/dependencies.json +++ b/llama_stack/templates/dependencies.json @@ -356,50 +356,7 @@ "fairscale", "faiss-cpu", "fastapi", - "fire", - "httpx", - "langdetect", - "lm-format-enforcer", - "matplotlib", - "mcp", - "nltk", - "numpy", - "openai", - "opentelemetry-exporter-otlp-proto-http", - "opentelemetry-sdk", - "pandas", - "pillow", - "psycopg2-binary", - "pymongo", - "pypdf", - "pythainlp", - "redis", - "requests", - "scikit-learn", - "scipy", - "sentence-transformers", - "sentencepiece", - "torch", - "torchvision", - "tqdm", - "transformers", - "tree_sitter", - "uvicorn", - "zmq" - ], - "meta-reference-quantized-gpu": [ - "accelerate", - "aiosqlite", - "autoevals", - "blobfile", - "chardet", - "chromadb-client", - "datasets", - "emoji", - "fairscale", - "faiss-cpu", - "fastapi", - "fbgemm-gpu", + "fbgemm-gpu-genai==1.1.2", "fire", "httpx", "langdetect", diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index 2cf49cc36..9f97158f8 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -18,6 +18,9 @@ providers: model: ${env.INFERENCE_MODEL} max_seq_len: 4096 checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null} + quantization: + type: ${env.QUANTIZATION_TYPE:bf16} + model_parallel_size: ${env.MODEL_PARALLEL_SIZE:0} - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} @@ -27,6 +30,9 @@ providers: model: ${env.SAFETY_MODEL} 
max_seq_len: 4096 checkpoint_dir: ${env.SAFETY_CHECKPOINT_DIR:null} + quantization: + type: ${env.QUANTIZATION_TYPE:bf16} + model_parallel_size: ${env.MODEL_PARALLEL_SIZE:0} vector_io: - provider_id: faiss provider_type: inline::faiss diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index 964dfafeb..eda332123 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -18,6 +18,9 @@ providers: model: ${env.INFERENCE_MODEL} max_seq_len: 4096 checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null} + quantization: + type: ${env.QUANTIZATION_TYPE:bf16} + model_parallel_size: ${env.MODEL_PARALLEL_SIZE:0} - provider_id: sentence-transformers provider_type: inline::sentence-transformers config: {} diff --git a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml deleted file mode 100644 index 7bbcfe5f2..000000000 --- a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml +++ /dev/null @@ -1,32 +0,0 @@ -version: '2' -distribution_spec: - description: Use Meta Reference with fp8, int4 quantization for running LLM inference - providers: - inference: - - inline::meta-reference-quantized - vector_io: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::code-interpreter - - inline::rag-runtime - - remote::model-context-protocol -image_type: conda diff --git a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md deleted file mode 
100644 index 1855da6c9..000000000 --- a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md +++ /dev/null @@ -1,113 +0,0 @@ ---- -orphan: true ---- -# Meta Reference Quantized Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations: - -{{ providers_table }} - -The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc. - -Note that you need access to nvidia GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs. - -{% if run_config_env_vars %} -### Environment Variables - -The following environment variables can be configured: - -{% for var, (default_value, description) in run_config_env_vars.items() %} -- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) -{% endfor %} -{% endif %} - - -## Prerequisite: Downloading Models - -Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. 
- -``` -$ llama model list --downloaded -┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ -┃ Model ┃ Size ┃ Modified Time ┃ -┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ -│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │ -├─────────────────────────────────────────┼──────────┼─────────────────────┤ -│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │ -└─────────────────────────────────────────┴──────────┴─────────────────────┘ -``` - -## Running the Distribution - -You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. 
- -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ~/.llama:/root/.llama \ - llamastack/distribution-{{ name }} \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ~/.llama:/root/.llama \ - llamastack/distribution-{{ name }} \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ - --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B -``` - -### Via Conda - -Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available. - -```bash -llama stack build --template {{ name }} --image-type conda -llama stack run distributions/{{ name }}/run.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -llama stack run distributions/{{ name }}/run-with-safety.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ - --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B -``` diff --git a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py deleted file mode 100644 index c46ea8bc6..000000000 --- a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from pathlib import Path - -from llama_stack.apis.models.models import ModelType -from llama_stack.distribution.datatypes import ModelInput, Provider, ToolGroupInput -from llama_stack.providers.inline.inference.meta_reference import ( - MetaReferenceQuantizedInferenceConfig, -) -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["inline::meta-reference-quantized"], - "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::code-interpreter", - "inline::rag-runtime", - "remote::model-context-protocol", - ], - } - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ToolGroupInput( - toolgroup_id="builtin::code_interpreter", - provider_id="code-interpreter", - ), - ] - name = "meta-reference-quantized-gpu" - inference_provider = Provider( - provider_id="meta-reference-inference", - provider_type="inline::meta-reference-quantized", - config=MetaReferenceQuantizedInferenceConfig.sample_run_config( - model="${env.INFERENCE_MODEL}", - checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}", - ), - ) - embedding_provider = Provider( - provider_id="sentence-transformers", - 
provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - vector_io_provider = Provider( - provider_id="faiss", - provider_type="inline::faiss", - config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - - inference_model = ModelInput( - model_id="${env.INFERENCE_MODEL}", - provider_id="meta-reference-inference", - ) - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id="sentence-transformers", - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Use Meta Reference with fp8, int4 quantization for running LLM inference", - template_path=Path(__file__).parent / "doc_template.md", - providers=providers, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider, embedding_provider], - "vector_io": [vector_io_provider], - }, - default_models=[inference_model, embedding_model], - default_tool_groups=default_tool_groups, - ), - }, - run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - "INFERENCE_MODEL": ( - "meta-llama/Llama-3.2-3B-Instruct", - "Inference model loaded into the Meta Reference server", - ), - "INFERENCE_CHECKPOINT_DIR": ( - "null", - "Directory containing the Meta Reference model checkpoint", - ), - }, - ) diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml deleted file mode 100644 index f934ecfbb..000000000 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ /dev/null @@ -1,134 +0,0 @@ -version: '2' -image_name: meta-reference-quantized-gpu -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: meta-reference-inference - 
provider_type: inline::meta-reference-quantized - config: - model: ${env.INFERENCE_MODEL} - max_seq_len: 4096 - checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:null} - quantization: - type: fp8 - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/agents_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:\u200B}" - sinks: ${env.TELEMETRY_SINKS:console,sqlite} - sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/meta-reference-quantized-gpu/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - 
provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:} - max_results: 3 - - provider_id: code-interpreter - provider_type: inline::code-interpreter - config: {} - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/registry.db -models: -- metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: meta-reference-inference - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: [] -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -- toolgroup_id: builtin::code_interpreter - provider_id: code-interpreter -server: - port: 8321 diff --git a/pyproject.toml b/pyproject.toml index 8d8ff4338..8ae7ddbb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -224,9 +224,9 @@ exclude = [ "^llama_stack/providers/inline/eval/meta_reference/eval\\.py$", "^llama_stack/providers/inline/inference/meta_reference/config\\.py$", "^llama_stack/providers/inline/inference/meta_reference/inference\\.py$", - "^llama_stack/providers/inline/inference/meta_reference/llama3/generation\\.py$", - "^llama_stack/providers/inline/inference/meta_reference/llama3/multimodal/model\\.py$", - 
"^llama_stack/providers/inline/inference/meta_reference/llama4/", + "^llama_stack/models/llama/llama3/generation\\.py$", + "^llama_stack/models/llama/llama3/multimodal/model\\.py$", + "^llama_stack/models/llama/llama4/", "^llama_stack/providers/inline/inference/meta_reference/parallel_utils\\.py$", "^llama_stack/providers/inline/inference/meta_reference/quantization/fp8_impls\\.py$", "^llama_stack/providers/inline/inference/meta_reference/quantization/loader\\.py$", diff --git a/scripts/generate_prompt_format.py b/scripts/generate_prompt_format.py index 08c5bea22..5598e35f6 100755 --- a/scripts/generate_prompt_format.py +++ b/scripts/generate_prompt_format.py @@ -5,13 +5,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# top-level folder for each specific model found within the models/ directory at -# the top-level of this source tree. 
- # Run this script: # torchrun --nproc_per_node=8 scripts/generate_prompt_format.py meta-llama/Llama-4-17B-Omni-Instruct-BF16-16E ~/.llama/checkpoints/Llama-4-17B-Omni-Instruct-BF16-16E/ llama_stack.models.llama.llama4.prompts llama_stack/models/llama/llama4/prompt_format.md @@ -22,16 +15,9 @@ from pathlib import Path import fire +from llama_stack.models.llama.llama3.generation import Llama3 +from llama_stack.models.llama.llama4.generation import Llama4 from llama_stack.models.llama.sku_list import resolve_model -from llama_stack.providers.inline.inference.meta_reference.config import ( - MetaReferenceInferenceConfig, -) -from llama_stack.providers.inline.inference.meta_reference.llama3.generation import ( - Llama3, -) -from llama_stack.providers.inline.inference.meta_reference.llama4.generation import ( - Llama4, -) THIS_DIR = Path(__file__).parent.resolve() @@ -50,24 +36,12 @@ def run_main( if not llama_model: raise ValueError(f"Model {model_id} not found") - if not llama4: - config = MetaReferenceInferenceConfig( - model=model_id, - max_seq_len=4096, - max_batch_size=1, - checkpoint_dir=checkpoint_dir, - ) - generator = Llama3.build( - config=config, - model_id=model_id, - llama_model=llama_model, - ) - else: - generator = Llama4.build( - ckpt_dir=checkpoint_dir, - max_seq_len=4096, - max_batch_size=1, - ) + cls = Llama4 if llama4 else Llama3 + generator = cls.build( + ckpt_dir=checkpoint_dir, + max_seq_len=4096, + max_batch_size=1, + ) use_cases = module.usecases() text = "" diff --git a/tests/integration/report.py b/tests/integration/report.py index c07338ce6..a50f51d3f 100644 --- a/tests/integration/report.py +++ b/tests/integration/report.py @@ -11,7 +11,6 @@ import pytest from pytest import CollectReport from termcolor import cprint -from llama_stack.models.llama.datatypes import CoreModelId from llama_stack.models.llama.sku_list import ( all_registered_models, llama3_1_instruct_models, @@ -20,6 +19,7 @@ from llama_stack.models.llama.sku_list import ( 
llama3_instruct_models, safety_models, ) +from llama_stack.models.llama.sku_types import CoreModelId from llama_stack.providers.datatypes import Api from .metadata import API_MAPS From 7b4eb0967e82eab17029f5064f15f753e36b12fc Mon Sep 17 00:00:00 2001 From: ehhuang Date: Mon, 7 Apr 2025 23:06:28 -0700 Subject: [PATCH 07/24] test: verification on provider's OAI endpoints (#1893) # What does this PR do? ## Test Plan export MODEL=accounts/fireworks/models/llama4-scout-instruct-basic; LLAMA_STACK_CONFIG=verification pytest -s -v tests/integration/inference --vision-model $MODEL --text-model $MODEL --- .../self_hosted_distro/fireworks.md | 2 + .../distributions/self_hosted_distro/groq.md | 2 + .../self_hosted_distro/sambanova.md | 1 + .../self_hosted_distro/together.md | 2 + llama_stack/providers/registry/inference.py | 50 ++ .../cerebras_openai_compat/__init__.py | 17 + .../cerebras_openai_compat/cerebras.py | 30 + .../cerebras_openai_compat/config.py | 38 ++ .../remote/inference/fireworks/models.py | 8 + .../fireworks_openai_compat/__init__.py | 17 + .../fireworks_openai_compat/config.py | 38 ++ .../fireworks_openai_compat/fireworks.py | 30 + .../providers/remote/inference/groq/models.py | 8 + .../inference/groq_openai_compat/__init__.py | 17 + .../inference/groq_openai_compat/config.py | 38 ++ .../inference/groq_openai_compat/groq.py | 30 + .../remote/inference/sambanova/models.py | 4 + .../sambanova_openai_compat/__init__.py | 17 + .../sambanova_openai_compat/config.py | 38 ++ .../sambanova_openai_compat/sambanova.py | 30 + .../remote/inference/together/models.py | 14 + .../remote/inference/together/together.py | 2 +- .../together_openai_compat/__init__.py | 17 + .../together_openai_compat/config.py | 38 ++ .../together_openai_compat/together.py | 30 + .../utils/inference/model_registry.py | 13 +- .../utils/inference/openai_compat.py | 32 +- llama_stack/templates/ci-tests/run.yaml | 20 + llama_stack/templates/dependencies.json | 39 ++ 
llama_stack/templates/dev/run.yaml | 40 ++ .../templates/fireworks/run-with-safety.yaml | 20 + llama_stack/templates/fireworks/run.yaml | 20 + llama_stack/templates/groq/run.yaml | 20 + llama_stack/templates/sambanova/run.yaml | 10 + .../templates/together/run-with-safety.yaml | 30 + llama_stack/templates/together/run.yaml | 30 + .../templates/verification/__init__.py | 7 + llama_stack/templates/verification/build.yaml | 38 ++ llama_stack/templates/verification/run.yaml | 626 ++++++++++++++++++ .../templates/verification/verification.py | 206 ++++++ .../inference/test_text_inference.py | 22 +- .../inference/test_vision_inference.py | 5 +- .../test_cases/inference/chat_completion.json | 4 +- 43 files changed, 1683 insertions(+), 17 deletions(-) create mode 100644 llama_stack/providers/remote/inference/cerebras_openai_compat/__init__.py create mode 100644 llama_stack/providers/remote/inference/cerebras_openai_compat/cerebras.py create mode 100644 llama_stack/providers/remote/inference/cerebras_openai_compat/config.py create mode 100644 llama_stack/providers/remote/inference/fireworks_openai_compat/__init__.py create mode 100644 llama_stack/providers/remote/inference/fireworks_openai_compat/config.py create mode 100644 llama_stack/providers/remote/inference/fireworks_openai_compat/fireworks.py create mode 100644 llama_stack/providers/remote/inference/groq_openai_compat/__init__.py create mode 100644 llama_stack/providers/remote/inference/groq_openai_compat/config.py create mode 100644 llama_stack/providers/remote/inference/groq_openai_compat/groq.py create mode 100644 llama_stack/providers/remote/inference/sambanova_openai_compat/__init__.py create mode 100644 llama_stack/providers/remote/inference/sambanova_openai_compat/config.py create mode 100644 llama_stack/providers/remote/inference/sambanova_openai_compat/sambanova.py create mode 100644 llama_stack/providers/remote/inference/together_openai_compat/__init__.py create mode 100644 
llama_stack/providers/remote/inference/together_openai_compat/config.py create mode 100644 llama_stack/providers/remote/inference/together_openai_compat/together.py create mode 100644 llama_stack/templates/verification/__init__.py create mode 100644 llama_stack/templates/verification/build.yaml create mode 100644 llama_stack/templates/verification/run.yaml create mode 100644 llama_stack/templates/verification/verification.py diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index ee4bf0b25..ee9ddc818 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -46,6 +46,8 @@ The following models are available by default: - `accounts/fireworks/models/llama-v3p3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)` - `accounts/fireworks/models/llama-guard-3-8b (aliases: meta-llama/Llama-Guard-3-8B)` - `accounts/fireworks/models/llama-guard-3-11b-vision (aliases: meta-llama/Llama-Guard-3-11B-Vision)` +- `accounts/fireworks/models/llama4-scout-instruct-basic (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)` +- `accounts/fireworks/models/llama4-maverick-instruct-basic (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)` - `nomic-ai/nomic-embed-text-v1.5 ` diff --git a/docs/source/distributions/self_hosted_distro/groq.md b/docs/source/distributions/self_hosted_distro/groq.md index fe922f23d..4f5a8a859 100644 --- a/docs/source/distributions/self_hosted_distro/groq.md +++ b/docs/source/distributions/self_hosted_distro/groq.md @@ -42,6 +42,8 @@ The following models are available by default: - `groq/llama3-70b-8192 (aliases: meta-llama/Llama-3-70B-Instruct)` - `groq/llama-3.3-70b-versatile (aliases: meta-llama/Llama-3.3-70B-Instruct)` - `groq/llama-3.2-3b-preview (aliases: meta-llama/Llama-3.2-3B-Instruct)` +- `groq/llama-4-scout-17b-16e-instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)` +- 
`groq/llama-4-maverick-17b-128e-instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)` ### Prerequisite: API Keys diff --git a/docs/source/distributions/self_hosted_distro/sambanova.md b/docs/source/distributions/self_hosted_distro/sambanova.md index 1d2e0d9df..76b976d78 100644 --- a/docs/source/distributions/self_hosted_distro/sambanova.md +++ b/docs/source/distributions/self_hosted_distro/sambanova.md @@ -43,6 +43,7 @@ The following models are available by default: - `Llama-3.2-11B-Vision-Instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)` - `Llama-3.2-90B-Vision-Instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)` - `Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)` +- `Llama-4-Scout-17B-16E-Instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)` ### Prerequisite: API Keys diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md index b07e85a1c..3ebb1f59e 100644 --- a/docs/source/distributions/self_hosted_distro/together.md +++ b/docs/source/distributions/self_hosted_distro/together.md @@ -48,6 +48,8 @@ The following models are available by default: - `meta-llama/Llama-Guard-3-11B-Vision-Turbo (aliases: meta-llama/Llama-Guard-3-11B-Vision)` - `togethercomputer/m2-bert-80M-8k-retrieval ` - `togethercomputer/m2-bert-80M-32k-retrieval ` +- `meta-llama/Llama-4-Scout-17B-16E-Instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct, together/meta-llama/Llama-4-Scout-17B-16E-Instruct)` +- `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct, together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8)` ### Prerequisite: API Keys diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index 5f9ae421f..aabb3bbdf 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -217,6 +217,56 @@ def 
available_providers() -> List[ProviderSpec]: provider_data_validator="llama_stack.providers.remote.inference.groq.config.GroqProviderDataValidator", ), ), + remote_provider_spec( + api=Api.inference, + adapter=AdapterSpec( + adapter_type="fireworks-openai-compat", + pip_packages=["litellm"], + module="llama_stack.providers.remote.inference.fireworks_openai_compat", + config_class="llama_stack.providers.remote.inference.fireworks_openai_compat.config.FireworksCompatConfig", + provider_data_validator="llama_stack.providers.remote.inference.fireworks_openai_compat.config.FireworksProviderDataValidator", + ), + ), + remote_provider_spec( + api=Api.inference, + adapter=AdapterSpec( + adapter_type="together-openai-compat", + pip_packages=["litellm"], + module="llama_stack.providers.remote.inference.together_openai_compat", + config_class="llama_stack.providers.remote.inference.together_openai_compat.config.TogetherCompatConfig", + provider_data_validator="llama_stack.providers.remote.inference.together_openai_compat.config.TogetherProviderDataValidator", + ), + ), + remote_provider_spec( + api=Api.inference, + adapter=AdapterSpec( + adapter_type="groq-openai-compat", + pip_packages=["litellm"], + module="llama_stack.providers.remote.inference.groq_openai_compat", + config_class="llama_stack.providers.remote.inference.groq_openai_compat.config.GroqCompatConfig", + provider_data_validator="llama_stack.providers.remote.inference.groq_openai_compat.config.GroqProviderDataValidator", + ), + ), + remote_provider_spec( + api=Api.inference, + adapter=AdapterSpec( + adapter_type="sambanova-openai-compat", + pip_packages=["litellm"], + module="llama_stack.providers.remote.inference.sambanova_openai_compat", + config_class="llama_stack.providers.remote.inference.sambanova_openai_compat.config.SambaNovaCompatConfig", + provider_data_validator="llama_stack.providers.remote.inference.sambanova_openai_compat.config.SambaNovaProviderDataValidator", + ), + ), + remote_provider_spec( + 
api=Api.inference, + adapter=AdapterSpec( + adapter_type="cerebras-openai-compat", + pip_packages=["litellm"], + module="llama_stack.providers.remote.inference.cerebras_openai_compat", + config_class="llama_stack.providers.remote.inference.cerebras_openai_compat.config.CerebrasCompatConfig", + provider_data_validator="llama_stack.providers.remote.inference.cerebras_openai_compat.config.CerebrasProviderDataValidator", + ), + ), remote_provider_spec( api=Api.inference, adapter=AdapterSpec( diff --git a/llama_stack/providers/remote/inference/cerebras_openai_compat/__init__.py b/llama_stack/providers/remote/inference/cerebras_openai_compat/__init__.py new file mode 100644 index 000000000..a5f07edd2 --- /dev/null +++ b/llama_stack/providers/remote/inference/cerebras_openai_compat/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.apis.inference import Inference + +from .config import CerebrasCompatConfig + + +async def get_adapter_impl(config: CerebrasCompatConfig, _deps) -> Inference: + # import dynamically so the import is used only when it is needed + from .cerebras import CerebrasCompatInferenceAdapter + + adapter = CerebrasCompatInferenceAdapter(config) + return adapter diff --git a/llama_stack/providers/remote/inference/cerebras_openai_compat/cerebras.py b/llama_stack/providers/remote/inference/cerebras_openai_compat/cerebras.py new file mode 100644 index 000000000..b3f109dcc --- /dev/null +++ b/llama_stack/providers/remote/inference/cerebras_openai_compat/cerebras.py @@ -0,0 +1,30 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from llama_stack.providers.remote.inference.cerebras_openai_compat.config import CerebrasCompatConfig +from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin + +from ..cerebras.models import MODEL_ENTRIES + + +class CerebrasCompatInferenceAdapter(LiteLLMOpenAIMixin): + _config: CerebrasCompatConfig + + def __init__(self, config: CerebrasCompatConfig): + LiteLLMOpenAIMixin.__init__( + self, + model_entries=MODEL_ENTRIES, + api_key_from_config=config.api_key, + provider_data_api_key_field="cerebras_api_key", + openai_compat_api_base=config.openai_compat_api_base, + ) + self.config = config + + async def initialize(self): + await super().initialize() + + async def shutdown(self): + await super().shutdown() diff --git a/llama_stack/providers/remote/inference/cerebras_openai_compat/config.py b/llama_stack/providers/remote/inference/cerebras_openai_compat/config.py new file mode 100644 index 000000000..149c0a202 --- /dev/null +++ b/llama_stack/providers/remote/inference/cerebras_openai_compat/config.py @@ -0,0 +1,38 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import Any, Dict, Optional + +from pydantic import BaseModel, Field + +from llama_stack.schema_utils import json_schema_type + + +class CerebrasProviderDataValidator(BaseModel): + cerebras_api_key: Optional[str] = Field( + default=None, + description="API key for Cerebras models", + ) + + +@json_schema_type +class CerebrasCompatConfig(BaseModel): + api_key: Optional[str] = Field( + default=None, + description="The Cerebras API key", + ) + + openai_compat_api_base: str = Field( + default="https://api.cerebras.ai/v1", + description="The URL for the Cerebras API server", + ) + + @classmethod + def sample_run_config(cls, api_key: str = "${env.CEREBRAS_API_KEY}", **kwargs) -> Dict[str, Any]: + return { + "openai_compat_api_base": "https://api.cerebras.ai/v1", + "api_key": api_key, + } diff --git a/llama_stack/providers/remote/inference/fireworks/models.py b/llama_stack/providers/remote/inference/fireworks/models.py index 4975d061f..027eeab8d 100644 --- a/llama_stack/providers/remote/inference/fireworks/models.py +++ b/llama_stack/providers/remote/inference/fireworks/models.py @@ -48,6 +48,14 @@ MODEL_ENTRIES = [ "accounts/fireworks/models/llama-guard-3-11b-vision", CoreModelId.llama_guard_3_11b_vision.value, ), + build_hf_repo_model_entry( + "accounts/fireworks/models/llama4-scout-instruct-basic", + CoreModelId.llama4_scout_17b_16e_instruct.value, + ), + build_hf_repo_model_entry( + "accounts/fireworks/models/llama4-maverick-instruct-basic", + CoreModelId.llama4_maverick_17b_128e_instruct.value, + ), ProviderModelEntry( provider_model_id="nomic-ai/nomic-embed-text-v1.5", model_type=ModelType.embedding, diff --git a/llama_stack/providers/remote/inference/fireworks_openai_compat/__init__.py b/llama_stack/providers/remote/inference/fireworks_openai_compat/__init__.py new file mode 100644 index 000000000..f78f218b5 --- /dev/null +++ b/llama_stack/providers/remote/inference/fireworks_openai_compat/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, 
Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.apis.inference import Inference + +from .config import FireworksCompatConfig + + +async def get_adapter_impl(config: FireworksCompatConfig, _deps) -> Inference: + # import dynamically so the import is used only when it is needed + from .fireworks import FireworksCompatInferenceAdapter + + adapter = FireworksCompatInferenceAdapter(config) + return adapter diff --git a/llama_stack/providers/remote/inference/fireworks_openai_compat/config.py b/llama_stack/providers/remote/inference/fireworks_openai_compat/config.py new file mode 100644 index 000000000..0263d348a --- /dev/null +++ b/llama_stack/providers/remote/inference/fireworks_openai_compat/config.py @@ -0,0 +1,38 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
# Model holding one optional field, ``fireworks_api_key``.
class FireworksProviderDataValidator(BaseModel):
    fireworks_api_key: Optional[str] = Field(
        description="API key for Fireworks models",
        default=None,
    )


# Settings for the Fireworks OpenAI-compatible provider: an optional API key
# and the base URL of the API server.
@json_schema_type
class FireworksCompatConfig(BaseModel):
    # May be left unset (None).
    api_key: Optional[str] = Field(
        description="The Fireworks API key",
        default=None,
    )

    # Defaults to the public Fireworks inference endpoint.
    openai_compat_api_base: str = Field(
        description="The URL for the Fireworks API server",
        default="https://api.fireworks.ai/inference/v1",
    )

    @classmethod
    def sample_run_config(cls, api_key: str = "${env.FIREWORKS_API_KEY}", **kwargs) -> Dict[str, Any]:
        """Return a sample provider configuration as a plain dict.

        Args:
            api_key: Value stored under ``"api_key"``; by default the
                ``${env.FIREWORKS_API_KEY}`` placeholder string.
            **kwargs: Accepted and ignored.

        Returns:
            A dict with the keys ``"openai_compat_api_base"`` and ``"api_key"``.
        """
        return dict(
            openai_compat_api_base="https://api.fireworks.ai/inference/v1",
            api_key=api_key,
        )
class FireworksCompatInferenceAdapter(LiteLLMOpenAIMixin):
    """Inference adapter for Fireworks via its OpenAI-compatible API.

    The class only defines construction plus ``initialize``/``shutdown``
    pass-throughs; everything else comes from ``LiteLLMOpenAIMixin``.
    """

    # Annotate the attribute that ``__init__`` actually assigns. The previous
    # annotation named ``_config``, which was never set on the instance.
    config: FireworksCompatConfig

    def __init__(self, config: FireworksCompatConfig):
        """Configure the mixin from *config* and keep *config* on the instance.

        Args:
            config: Supplies ``api_key`` and ``openai_compat_api_base``.
        """
        LiteLLMOpenAIMixin.__init__(
            self,
            model_entries=MODEL_ENTRIES,
            api_key_from_config=config.api_key,
            provider_data_api_key_field="fireworks_api_key",
            openai_compat_api_base=config.openai_compat_api_base,
        )
        self.config = config

    async def initialize(self):
        """Delegate to the parent ``initialize``; no adapter-specific work."""
        await super().initialize()

    async def shutdown(self):
        """Delegate to the parent ``shutdown``; no adapter-specific work."""
        await super().shutdown()
# Model holding one optional field, ``groq_api_key``.
class GroqProviderDataValidator(BaseModel):
    groq_api_key: Optional[str] = Field(
        description="API key for Groq models",
        default=None,
    )


# Settings for the Groq OpenAI-compatible provider: an optional API key and
# the base URL of the API server.
@json_schema_type
class GroqCompatConfig(BaseModel):
    # May be left unset (None).
    api_key: Optional[str] = Field(
        description="The Groq API key",
        default=None,
    )

    # Defaults to the public Groq OpenAI-style endpoint.
    openai_compat_api_base: str = Field(
        description="The URL for the Groq API server",
        default="https://api.groq.com/openai/v1",
    )

    @classmethod
    def sample_run_config(cls, api_key: str = "${env.GROQ_API_KEY}", **kwargs) -> Dict[str, Any]:
        """Return a sample provider configuration as a plain dict.

        Args:
            api_key: Value stored under ``"api_key"``; by default the
                ``${env.GROQ_API_KEY}`` placeholder string.
            **kwargs: Accepted and ignored.

        Returns:
            A dict with the keys ``"openai_compat_api_base"`` and ``"api_key"``.
        """
        return dict(
            openai_compat_api_base="https://api.groq.com/openai/v1",
            api_key=api_key,
        )
class GroqCompatInferenceAdapter(LiteLLMOpenAIMixin):
    """Inference adapter for Groq via its OpenAI-compatible API.

    The class only defines construction plus ``initialize``/``shutdown``
    pass-throughs; everything else comes from ``LiteLLMOpenAIMixin``.
    """

    # Annotate the attribute that ``__init__`` actually assigns. The previous
    # annotation named ``_config``, which was never set on the instance.
    config: GroqCompatConfig

    def __init__(self, config: GroqCompatConfig):
        """Configure the mixin from *config* and keep *config* on the instance.

        Args:
            config: Supplies ``api_key`` and ``openai_compat_api_base``.
        """
        LiteLLMOpenAIMixin.__init__(
            self,
            model_entries=MODEL_ENTRIES,
            api_key_from_config=config.api_key,
            provider_data_api_key_field="groq_api_key",
            openai_compat_api_base=config.openai_compat_api_base,
        )
        self.config = config

    async def initialize(self):
        """Delegate to the parent ``initialize``; no adapter-specific work."""
        await super().initialize()

    async def shutdown(self):
        """Delegate to the parent ``shutdown``; no adapter-specific work."""
        await super().shutdown()
# Model holding one optional field, ``sambanova_api_key``.
class SambaNovaProviderDataValidator(BaseModel):
    sambanova_api_key: Optional[str] = Field(
        description="API key for SambaNova models",
        default=None,
    )


# Settings for the SambaNova OpenAI-compatible provider: an optional API key
# and the base URL of the API server.
@json_schema_type
class SambaNovaCompatConfig(BaseModel):
    # May be left unset (None).
    api_key: Optional[str] = Field(
        description="The SambaNova API key",
        default=None,
    )

    # Defaults to the public SambaNova endpoint.
    openai_compat_api_base: str = Field(
        description="The URL for the SambaNova API server",
        default="https://api.sambanova.ai/v1",
    )

    @classmethod
    def sample_run_config(cls, api_key: str = "${env.SAMBANOVA_API_KEY}", **kwargs) -> Dict[str, Any]:
        """Return a sample provider configuration as a plain dict.

        Args:
            api_key: Value stored under ``"api_key"``; by default the
                ``${env.SAMBANOVA_API_KEY}`` placeholder string.
            **kwargs: Accepted and ignored.

        Returns:
            A dict with the keys ``"openai_compat_api_base"`` and ``"api_key"``.
        """
        return dict(
            openai_compat_api_base="https://api.sambanova.ai/v1",
            api_key=api_key,
        )
class SambaNovaCompatInferenceAdapter(LiteLLMOpenAIMixin):
    """Inference adapter for SambaNova via its OpenAI-compatible API.

    The class only defines construction plus ``initialize``/``shutdown``
    pass-throughs; everything else comes from ``LiteLLMOpenAIMixin``.
    """

    # Annotate the attribute that ``__init__`` actually assigns. The previous
    # annotation named ``_config``, which was never set on the instance.
    config: SambaNovaCompatConfig

    def __init__(self, config: SambaNovaCompatConfig):
        """Configure the mixin from *config* and keep *config* on the instance.

        Args:
            config: Supplies ``api_key`` and ``openai_compat_api_base``.
        """
        LiteLLMOpenAIMixin.__init__(
            self,
            model_entries=MODEL_ENTRIES,
            api_key_from_config=config.api_key,
            provider_data_api_key_field="sambanova_api_key",
            openai_compat_api_base=config.openai_compat_api_base,
        )
        self.config = config

    async def initialize(self):
        """Delegate to the parent ``initialize``; no adapter-specific work."""
        await super().initialize()

    async def shutdown(self):
        """Delegate to the parent ``shutdown``; no adapter-specific work."""
        await super().shutdown()
async def get_adapter_impl(config: TogetherCompatConfig, _deps) -> Inference:
    """Build the Together OpenAI-compat inference adapter for *config*.

    Args:
        config: Provider configuration handed to the adapter constructor.
        _deps: Accepted but not used by this function.

    Returns:
        A newly constructed ``TogetherCompatInferenceAdapter``.
    """
    # Deferred import: the adapter module is loaded only when an adapter is
    # actually requested.
    from .together import TogetherCompatInferenceAdapter

    return TogetherCompatInferenceAdapter(config)
# Model holding one optional field, ``together_api_key``.
class TogetherProviderDataValidator(BaseModel):
    together_api_key: Optional[str] = Field(
        description="API key for Together models",
        default=None,
    )


# Settings for the Together OpenAI-compatible provider: an optional API key
# and the base URL of the API server.
@json_schema_type
class TogetherCompatConfig(BaseModel):
    # May be left unset (None).
    api_key: Optional[str] = Field(
        description="The Together API key",
        default=None,
    )

    # Defaults to the public Together endpoint.
    openai_compat_api_base: str = Field(
        description="The URL for the Together API server",
        default="https://api.together.xyz/v1",
    )

    @classmethod
    def sample_run_config(cls, api_key: str = "${env.TOGETHER_API_KEY}", **kwargs) -> Dict[str, Any]:
        """Return a sample provider configuration as a plain dict.

        Args:
            api_key: Value stored under ``"api_key"``; by default the
                ``${env.TOGETHER_API_KEY}`` placeholder string.
            **kwargs: Accepted and ignored.

        Returns:
            A dict with the keys ``"openai_compat_api_base"`` and ``"api_key"``.
        """
        return dict(
            openai_compat_api_base="https://api.together.xyz/v1",
            api_key=api_key,
        )
class TogetherCompatInferenceAdapter(LiteLLMOpenAIMixin):
    """Inference adapter for Together via its OpenAI-compatible API.

    The class only defines construction plus ``initialize``/``shutdown``
    pass-throughs; everything else comes from ``LiteLLMOpenAIMixin``.
    """

    # Annotate the attribute that ``__init__`` actually assigns. The previous
    # annotation named ``_config``, which was never set on the instance.
    config: TogetherCompatConfig

    def __init__(self, config: TogetherCompatConfig):
        """Configure the mixin from *config* and keep *config* on the instance.

        Args:
            config: Supplies ``api_key`` and ``openai_compat_api_base``.
        """
        LiteLLMOpenAIMixin.__init__(
            self,
            model_entries=MODEL_ENTRIES,
            api_key_from_config=config.api_key,
            provider_data_api_key_field="together_api_key",
            openai_compat_api_base=config.openai_compat_api_base,
        )
        self.config = config

    async def initialize(self):
        """Delegate to the parent ``initialize``; no adapter-specific work."""
        await super().initialize()

    async def shutdown(self):
        """Delegate to the parent ``shutdown``; no adapter-specific work."""
        await super().shutdown()
def to_openai_param_type(param_type: str) -> dict:
    """
    Convert a Python type-hint string to the OpenAI parameter type format.

    Scalar names (``str``, ``int``, ``float``, ``bool``) map to their
    JSON-schema names. ``list[...]`` becomes an ``array`` whose ``items``
    schema is the converted element type, so nested lists and element types
    outside the scalar table are still emitted as arrays. Any other string
    is passed through unchanged as the ``type`` value.

    Args:
        param_type: Type name such as ``"str"`` or ``"list[int]"``.

    Returns:
        A dict with a ``"type"`` key, plus ``"items"`` for list types that
        name an element type.

    Examples:
        'str' -> {'type': 'string'}
        'int' -> {'type': 'integer'}
        'list[str]' -> {'type': 'array', 'items': {'type': 'string'}}
        'list[int]' -> {'type': 'array', 'items': {'type': 'integer'}}
        'list[list[int]]' -> {'type': 'array', 'items': {'type': 'array', 'items': {'type': 'integer'}}}
        'list[Foo]' -> {'type': 'array', 'items': {'type': 'Foo'}}
    """
    # Handle basic types first
    basic_types = {
        "str": "string",
        "int": "integer",
        "float": "number",
        "bool": "boolean",
    }

    if param_type in basic_types:
        return {"type": basic_types[param_type]}

    # Handle list/array types. Recursing on the element type keeps the result
    # an array even when the element is itself a list or an unmapped name,
    # instead of leaking the raw "list[...]" string into the "type" field.
    list_prefix = "list["
    if param_type.startswith(list_prefix) and param_type.endswith("]"):
        inner_type = param_type[len(list_prefix) : -1]
        if not inner_type:
            # "list[]" names no element type; emit an unconstrained array.
            return {"type": "array"}
        return {"type": "array", "items": to_openai_param_type(inner_type)}

    # Unknown names are forwarded as-is.
    return {"type": param_type}
model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic + model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + model_type: llm - metadata: embedding_dimension: 768 context_length: 8192 diff --git a/llama_stack/templates/dependencies.json b/llama_stack/templates/dependencies.json index b8f475cea..053d6ef8a 100644 --- a/llama_stack/templates/dependencies.json +++ b/llama_stack/templates/dependencies.json @@ -682,6 +682,45 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], + "verification": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "emoji", + "fastapi", + "fire", + "httpx", + "langdetect", + "litellm", + "matplotlib", + "mcp", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pymongo", + "pypdf", + "pythainlp", + "redis", + "requests", + "scikit-learn", + "scipy", + "sentencepiece", + "sqlite-vec", + "tqdm", + "transformers", + "tree_sitter", + "uvicorn", + "sentence-transformers --no-deps", + "torch torchvision --index-url https://download.pytorch.org/whl/cpu" + ], "vllm-gpu": [ "aiosqlite", "autoevals", diff --git a/llama_stack/templates/dev/run.yaml b/llama_stack/templates/dev/run.yaml index b4546ca58..ea3b7252a 100644 --- a/llama_stack/templates/dev/run.yaml +++ b/llama_stack/templates/dev/run.yaml @@ -251,6 +251,26 @@ models: provider_id: fireworks provider_model_id: 
accounts/fireworks/models/llama-guard-3-11b-vision model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama4-scout-instruct-basic + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic + model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + model_type: llm - metadata: embedding_dimension: 768 context_length: 8192 @@ -356,6 +376,26 @@ models: provider_id: groq provider_model_id: groq/llama-3.2-3b-preview model_type: llm +- metadata: {} + model_id: groq/llama-4-scout-17b-16e-instruct + provider_id: groq + provider_model_id: groq/llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: groq + provider_model_id: groq/llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: groq/llama-4-maverick-17b-128e-instruct + provider_id: groq + provider_model_id: groq/llama-4-maverick-17b-128e-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: groq + provider_model_id: groq/llama-4-maverick-17b-128e-instruct + model_type: llm - metadata: embedding_dimension: 384 model_id: all-MiniLM-L6-v2 diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index 125c66177..aa6209db6 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ 
b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -205,6 +205,26 @@ models: provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama4-scout-instruct-basic + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic + model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + model_type: llm - metadata: embedding_dimension: 768 context_length: 8192 diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index 7b3c059e5..834ec8260 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -200,6 +200,26 @@ models: provider_id: fireworks provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama4-scout-instruct-basic + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic + model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + model_type: llm +- 
metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: fireworks + provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + model_type: llm - metadata: embedding_dimension: 768 context_length: 8192 diff --git a/llama_stack/templates/groq/run.yaml b/llama_stack/templates/groq/run.yaml index 6c83ed43d..f557e64fd 100644 --- a/llama_stack/templates/groq/run.yaml +++ b/llama_stack/templates/groq/run.yaml @@ -148,6 +148,26 @@ models: provider_id: groq provider_model_id: groq/llama-3.2-3b-preview model_type: llm +- metadata: {} + model_id: groq/llama-4-scout-17b-16e-instruct + provider_id: groq + provider_model_id: groq/llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: groq + provider_model_id: groq/llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: groq/llama-4-maverick-17b-128e-instruct + provider_id: groq + provider_model_id: groq/llama-4-maverick-17b-128e-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: groq + provider_model_id: groq/llama-4-maverick-17b-128e-instruct + model_type: llm - metadata: embedding_dimension: 384 model_id: all-MiniLM-L6-v2 diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml index a64ada759..e4e8e4e21 100644 --- a/llama_stack/templates/sambanova/run.yaml +++ b/llama_stack/templates/sambanova/run.yaml @@ -165,6 +165,16 @@ models: provider_id: sambanova provider_model_id: Meta-Llama-Guard-3-8B model_type: llm +- metadata: {} + model_id: Llama-4-Scout-17B-16E-Instruct + provider_id: sambanova + provider_model_id: Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: sambanova + provider_model_id: Llama-4-Scout-17B-16E-Instruct + model_type: llm shields: - shield_id: meta-llama/Llama-Guard-3-8B 
vector_dbs: [] diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index 1fbf64e40..105ce896d 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -219,6 +219,36 @@ models: provider_id: together provider_model_id: togethercomputer/m2-bert-80M-32k-retrieval model_type: embedding +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + provider_id: together + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + provider_id: together + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm - metadata: embedding_dimension: 384 model_id: all-MiniLM-L6-v2 diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index d71aea640..1f1613655 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -214,6 +214,36 @@ models: provider_id: together provider_model_id: togethercomputer/m2-bert-80M-32k-retrieval model_type: embedding +- metadata: {} + model_id: 
meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + provider_id: together + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: together + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + provider_id: together + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm - metadata: embedding_dimension: 384 model_id: all-MiniLM-L6-v2 diff --git a/llama_stack/templates/verification/__init__.py b/llama_stack/templates/verification/__init__.py new file mode 100644 index 000000000..5d8c281a6 --- /dev/null +++ b/llama_stack/templates/verification/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from .verification import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/verification/build.yaml b/llama_stack/templates/verification/build.yaml new file mode 100644 index 000000000..9f010d651 --- /dev/null +++ b/llama_stack/templates/verification/build.yaml @@ -0,0 +1,38 @@ +version: '2' +distribution_spec: + description: Distribution for running e2e tests in CI + providers: + inference: + - remote::openai + - remote::fireworks-openai-compat + - remote::together-openai-compat + - remote::groq-openai-compat + - remote::sambanova-openai-compat + - remote::cerebras-openai-compat + - inline::sentence-transformers + vector_io: + - inline::sqlite-vec + - remote::chromadb + - remote::pgvector + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust + tool_runtime: + - remote::brave-search + - remote::tavily-search + - inline::code-interpreter + - inline::rag-runtime + - remote::model-context-protocol +image_type: conda diff --git a/llama_stack/templates/verification/run.yaml b/llama_stack/templates/verification/run.yaml new file mode 100644 index 000000000..b6c2ca98d --- /dev/null +++ b/llama_stack/templates/verification/run.yaml @@ -0,0 +1,626 @@ +version: '2' +image_name: verification +apis: +- agents +- datasetio +- eval +- inference +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: openai + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY:} + - provider_id: fireworks-openai-compat + provider_type: remote::fireworks-openai-compat + config: + openai_compat_api_base: https://api.fireworks.ai/inference/v1 + api_key: ${env.FIREWORKS_API_KEY:} + - provider_id: together-openai-compat + provider_type: remote::together-openai-compat + config: + 
openai_compat_api_base: https://api.together.xyz/v1 + api_key: ${env.TOGETHER_API_KEY:} + - provider_id: groq-openai-compat + provider_type: remote::groq-openai-compat + config: + openai_compat_api_base: https://api.groq.com/openai/v1 + api_key: ${env.GROQ_API_KEY:} + - provider_id: sambanova-openai-compat + provider_type: remote::sambanova-openai-compat + config: + openai_compat_api_base: https://api.sambanova.ai/v1 + api_key: ${env.SAMBANOVA_API_KEY:} + - provider_id: cerebras-openai-compat + provider_type: remote::cerebras-openai-compat + config: + openai_compat_api_base: https://api.cerebras.ai/v1 + api_key: ${env.CEREBRAS_API_KEY:} + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers + config: {} + vector_io: + - provider_id: sqlite-vec + provider_type: inline::sqlite-vec + config: + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/sqlite_vec.db + - provider_id: ${env.ENABLE_CHROMADB+chromadb} + provider_type: remote::chromadb + config: + url: ${env.CHROMADB_URL:} + - provider_id: ${env.ENABLE_PGVECTOR+pgvector} + provider_type: remote::pgvector + config: + host: ${env.PGVECTOR_HOST:localhost} + port: ${env.PGVECTOR_PORT:5432} + db: ${env.PGVECTOR_DB:} + user: ${env.PGVECTOR_USER:} + password: ${env.PGVECTOR_PASSWORD:} + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: "${env.OTEL_SERVICE_NAME:\u200B}" + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/verification/trace_store.db} + eval: + - provider_id: meta-reference + provider_type: 
inline::meta-reference + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/meta_reference_eval.db + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/huggingface_datasetio.db + - provider_id: localfs + provider_type: inline::localfs + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/localfs_datasetio.db + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: code-interpreter + provider_type: inline::code-interpreter + config: {} + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/verification}/registry.db +models: +- metadata: {} + model_id: openai/gpt-4o + provider_id: openai + provider_model_id: openai/gpt-4o + model_type: llm +- metadata: {} + model_id: openai/gpt-4o-mini + provider_id: openai + provider_model_id: openai/gpt-4o-mini + model_type: llm +- metadata: {} + model_id: openai/chatgpt-4o-latest + provider_id: openai + provider_model_id: openai/chatgpt-4o-latest + model_type: llm +- metadata: + embedding_dimension: 1536 + 
context_length: 8192 + model_id: openai/text-embedding-3-small + provider_id: openai + provider_model_id: openai/text-embedding-3-small + model_type: embedding +- metadata: + embedding_dimension: 3072 + context_length: 8192 + model_id: openai/text-embedding-3-large + provider_id: openai + provider_model_id: openai/text-embedding-3-large + model_type: embedding +- metadata: {} + model_id: accounts/fireworks/models/llama-v3p1-8b-instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-8B-Instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct + model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama-v3p1-70b-instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-70B-Instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct + model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama-v3p1-405b-instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct + model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama-v3p2-3b-instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-3B-Instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct + model_type: llm +- metadata: {} + model_id: 
accounts/fireworks/models/llama-v3p2-11b-vision-instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct + model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-90B-Vision-Instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct + model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama-v3p3-70b-instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.3-70B-Instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct + model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama-guard-3-8b + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-guard-3-8b + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-Guard-3-8B + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-guard-3-8b + model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama-guard-3-11b-vision + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-Guard-3-11B-Vision + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision + model_type: llm +- metadata: 
{} + model_id: accounts/fireworks/models/llama4-scout-instruct-basic + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic + model_type: llm +- metadata: {} + model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: fireworks-openai-compat + provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic + model_type: llm +- metadata: + embedding_dimension: 768 + context_length: 8192 + model_id: nomic-ai/nomic-embed-text-v1.5 + provider_id: fireworks-openai-compat + provider_model_id: nomic-ai/nomic-embed-text-v1.5 + model_type: embedding +- metadata: {} + model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo + provider_id: together-openai-compat + provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-8B-Instruct + provider_id: together-openai-compat + provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo + provider_id: together-openai-compat + provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-70B-Instruct + provider_id: together-openai-compat + provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + provider_id: together-openai-compat + provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + model_type: 
llm +- metadata: {} + model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: together-openai-compat + provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-3B-Instruct + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-90B-Vision-Instruct + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.3-70B-Instruct + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Meta-Llama-Guard-3-8B + provider_id: together-openai-compat + provider_model_id: meta-llama/Meta-Llama-Guard-3-8B + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-Guard-3-8B + provider_id: together-openai-compat + provider_model_id: 
meta-llama/Meta-Llama-Guard-3-8B + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-Guard-3-11B-Vision + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo + model_type: llm +- metadata: + embedding_dimension: 768 + context_length: 8192 + model_id: togethercomputer/m2-bert-80M-8k-retrieval + provider_id: together-openai-compat + provider_model_id: togethercomputer/m2-bert-80M-8k-retrieval + model_type: embedding +- metadata: + embedding_dimension: 768 + context_length: 32768 + model_id: togethercomputer/m2-bert-80M-32k-retrieval + provider_id: together-openai-compat + provider_model_id: togethercomputer/m2-bert-80M-32k-retrieval + model_type: embedding +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: together-openai-compat + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm +- metadata: {} + model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + provider_id: 
together-openai-compat + provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + model_type: llm +- metadata: {} + model_id: groq/llama3-8b-8192 + provider_id: groq-openai-compat + provider_model_id: groq/llama3-8b-8192 + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-8B-Instruct + provider_id: groq-openai-compat + provider_model_id: groq/llama3-8b-8192 + model_type: llm +- metadata: {} + model_id: groq/llama-3.1-8b-instant + provider_id: groq-openai-compat + provider_model_id: groq/llama-3.1-8b-instant + model_type: llm +- metadata: {} + model_id: groq/llama3-70b-8192 + provider_id: groq-openai-compat + provider_model_id: groq/llama3-70b-8192 + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3-70B-Instruct + provider_id: groq-openai-compat + provider_model_id: groq/llama3-70b-8192 + model_type: llm +- metadata: {} + model_id: groq/llama-3.3-70b-versatile + provider_id: groq-openai-compat + provider_model_id: groq/llama-3.3-70b-versatile + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.3-70B-Instruct + provider_id: groq-openai-compat + provider_model_id: groq/llama-3.3-70b-versatile + model_type: llm +- metadata: {} + model_id: groq/llama-3.2-3b-preview + provider_id: groq-openai-compat + provider_model_id: groq/llama-3.2-3b-preview + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-3B-Instruct + provider_id: groq-openai-compat + provider_model_id: groq/llama-3.2-3b-preview + model_type: llm +- metadata: {} + model_id: groq/llama-4-scout-17b-16e-instruct + provider_id: groq-openai-compat + provider_model_id: groq/llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: groq-openai-compat + provider_model_id: groq/llama-4-scout-17b-16e-instruct + model_type: llm +- metadata: {} + model_id: groq/llama-4-maverick-17b-128e-instruct + provider_id: groq-openai-compat + provider_model_id: 
groq/llama-4-maverick-17b-128e-instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: groq-openai-compat + provider_model_id: groq/llama-4-maverick-17b-128e-instruct + model_type: llm +- metadata: {} + model_id: Meta-Llama-3.1-8B-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Meta-Llama-3.1-8B-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-8B-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Meta-Llama-3.1-8B-Instruct + model_type: llm +- metadata: {} + model_id: Meta-Llama-3.1-70B-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Meta-Llama-3.1-70B-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-70B-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Meta-Llama-3.1-70B-Instruct + model_type: llm +- metadata: {} + model_id: Meta-Llama-3.1-405B-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Meta-Llama-3.1-405B-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: sambanova-openai-compat + provider_model_id: Meta-Llama-3.1-405B-Instruct + model_type: llm +- metadata: {} + model_id: Meta-Llama-3.2-1B-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Meta-Llama-3.2-1B-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-1B-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Meta-Llama-3.2-1B-Instruct + model_type: llm +- metadata: {} + model_id: Meta-Llama-3.2-3B-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Meta-Llama-3.2-3B-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-3B-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Meta-Llama-3.2-3B-Instruct + model_type: llm +- metadata: {} + model_id: Meta-Llama-3.3-70B-Instruct + provider_id: sambanova-openai-compat + 
provider_model_id: Meta-Llama-3.3-70B-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.3-70B-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Meta-Llama-3.3-70B-Instruct + model_type: llm +- metadata: {} + model_id: Llama-3.2-11B-Vision-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Llama-3.2-11B-Vision-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Llama-3.2-11B-Vision-Instruct + model_type: llm +- metadata: {} + model_id: Llama-3.2-90B-Vision-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Llama-3.2-90B-Vision-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.2-90B-Vision-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Llama-3.2-90B-Vision-Instruct + model_type: llm +- metadata: {} + model_id: Meta-Llama-Guard-3-8B + provider_id: sambanova-openai-compat + provider_model_id: Meta-Llama-Guard-3-8B + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-Guard-3-8B + provider_id: sambanova-openai-compat + provider_model_id: Meta-Llama-Guard-3-8B + model_type: llm +- metadata: {} + model_id: Llama-4-Scout-17B-16E-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: sambanova-openai-compat + provider_model_id: Llama-4-Scout-17B-16E-Instruct + model_type: llm +- metadata: {} + model_id: llama3.1-8b + provider_id: cerebras-openai-compat + provider_model_id: llama3.1-8b + model_type: llm +- metadata: {} + model_id: meta-llama/Llama-3.1-8B-Instruct + provider_id: cerebras-openai-compat + provider_model_id: llama3.1-8b + model_type: llm +- metadata: {} + model_id: llama-3.3-70b + provider_id: cerebras-openai-compat + provider_model_id: llama-3.3-70b + model_type: llm +- metadata: {} 
+ model_id: meta-llama/Llama-3.3-70B-Instruct + provider_id: cerebras-openai-compat + provider_model_id: llama-3.3-70b + model_type: llm +- metadata: + embedding_dimension: 384 + model_id: all-MiniLM-L6-v2 + provider_id: sentence-transformers + model_type: embedding +shields: +- shield_id: meta-llama/Llama-Guard-3-8B +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +- toolgroup_id: builtin::code_interpreter + provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/verification/verification.py b/llama_stack/templates/verification/verification.py new file mode 100644 index 000000000..7ef8d6a96 --- /dev/null +++ b/llama_stack/templates/verification/verification.py @@ -0,0 +1,206 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from typing import List, Tuple + +from llama_stack.apis.models.models import ModelType +from llama_stack.distribution.datatypes import ( + ModelInput, + Provider, + ShieldInput, + ToolGroupInput, +) +from llama_stack.providers.inline.inference.sentence_transformers import ( + SentenceTransformersInferenceConfig, +) +from llama_stack.providers.inline.vector_io.sqlite_vec.config import ( + SQLiteVectorIOConfig, +) +from llama_stack.providers.remote.inference.cerebras.models import MODEL_ENTRIES as CEREBRAS_MODEL_ENTRIES +from llama_stack.providers.remote.inference.cerebras_openai_compat.config import CerebrasCompatConfig +from llama_stack.providers.remote.inference.fireworks.models import ( + MODEL_ENTRIES as FIREWORKS_MODEL_ENTRIES, +) +from llama_stack.providers.remote.inference.fireworks_openai_compat.config import FireworksCompatConfig +from llama_stack.providers.remote.inference.groq.models import ( + MODEL_ENTRIES as GROQ_MODEL_ENTRIES, +) +from llama_stack.providers.remote.inference.groq_openai_compat.config import GroqCompatConfig +from llama_stack.providers.remote.inference.openai.config import OpenAIConfig +from llama_stack.providers.remote.inference.openai.models import ( + MODEL_ENTRIES as OPENAI_MODEL_ENTRIES, +) +from llama_stack.providers.remote.inference.sambanova.models import MODEL_ENTRIES as SAMBANOVA_MODEL_ENTRIES +from llama_stack.providers.remote.inference.sambanova_openai_compat.config import SambaNovaCompatConfig +from llama_stack.providers.remote.inference.together.models import ( + MODEL_ENTRIES as TOGETHER_MODEL_ENTRIES, +) +from llama_stack.providers.remote.inference.together_openai_compat.config import TogetherCompatConfig +from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig +from llama_stack.providers.remote.vector_io.pgvector.config import ( + PGVectorVectorIOConfig, +) +from llama_stack.templates.template import ( + DistributionTemplate, + RunConfigSettings, + get_model_registry, +) + + +def 
get_inference_providers() -> Tuple[List[Provider], List[ModelInput]]: + # in this template, we allow each API key to be optional + providers = [ + ( + "openai", + OPENAI_MODEL_ENTRIES, + OpenAIConfig.sample_run_config(api_key="${env.OPENAI_API_KEY:}"), + ), + ( + "fireworks-openai-compat", + FIREWORKS_MODEL_ENTRIES, + FireworksCompatConfig.sample_run_config(api_key="${env.FIREWORKS_API_KEY:}"), + ), + ( + "together-openai-compat", + TOGETHER_MODEL_ENTRIES, + TogetherCompatConfig.sample_run_config(api_key="${env.TOGETHER_API_KEY:}"), + ), + ( + "groq-openai-compat", + GROQ_MODEL_ENTRIES, + GroqCompatConfig.sample_run_config(api_key="${env.GROQ_API_KEY:}"), + ), + ( + "sambanova-openai-compat", + SAMBANOVA_MODEL_ENTRIES, + SambaNovaCompatConfig.sample_run_config(api_key="${env.SAMBANOVA_API_KEY:}"), + ), + ( + "cerebras-openai-compat", + CEREBRAS_MODEL_ENTRIES, + CerebrasCompatConfig.sample_run_config(api_key="${env.CEREBRAS_API_KEY:}"), + ), + ] + inference_providers = [] + available_models = {} + for provider_id, model_entries, config in providers: + inference_providers.append( + Provider( + provider_id=provider_id, + provider_type=f"remote::{provider_id}", + config=config, + ) + ) + available_models[provider_id] = model_entries + return inference_providers, available_models + + +def get_distribution_template() -> DistributionTemplate: + inference_providers, available_models = get_inference_providers() + providers = { + "inference": ([p.provider_type for p in inference_providers] + ["inline::sentence-transformers"]), + "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], + "tool_runtime": [ + "remote::brave-search", + "remote::tavily-search", + 
"inline::code-interpreter", + "inline::rag-runtime", + "remote::model-context-protocol", + ], + } + name = "verification" + + vector_io_providers = [ + Provider( + provider_id="sqlite-vec", + provider_type="inline::sqlite-vec", + config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), + ), + Provider( + provider_id="${env.ENABLE_CHROMADB+chromadb}", + provider_type="remote::chromadb", + config=ChromaVectorIOConfig.sample_run_config(url="${env.CHROMADB_URL:}"), + ), + Provider( + provider_id="${env.ENABLE_PGVECTOR+pgvector}", + provider_type="remote::pgvector", + config=PGVectorVectorIOConfig.sample_run_config( + db="${env.PGVECTOR_DB:}", + user="${env.PGVECTOR_USER:}", + password="${env.PGVECTOR_PASSWORD:}", + ), + ), + ] + embedding_provider = Provider( + provider_id="sentence-transformers", + provider_type="inline::sentence-transformers", + config=SentenceTransformersInferenceConfig.sample_run_config(), + ) + + default_tool_groups = [ + ToolGroupInput( + toolgroup_id="builtin::websearch", + provider_id="tavily-search", + ), + ToolGroupInput( + toolgroup_id="builtin::rag", + provider_id="rag-runtime", + ), + ToolGroupInput( + toolgroup_id="builtin::code_interpreter", + provider_id="code-interpreter", + ), + ] + embedding_model = ModelInput( + model_id="all-MiniLM-L6-v2", + provider_id=embedding_provider.provider_id, + model_type=ModelType.embedding, + metadata={ + "embedding_dimension": 384, + }, + ) + + default_models = get_model_registry(available_models) + return DistributionTemplate( + name=name, + distro_type="self_hosted", + description="Distribution for running e2e tests in CI", + container_image=None, + template_path=None, + providers=providers, + available_models_by_provider=available_models, + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": inference_providers + [embedding_provider], + "vector_io": vector_io_providers, + }, + default_models=default_models + [embedding_model], + 
default_tool_groups=default_tool_groups, + default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], + ), + }, + run_config_env_vars={ + "LLAMA_STACK_PORT": ( + "8321", + "Port for the Llama Stack distribution server", + ), + "FIREWORKS_API_KEY": ( + "", + "Fireworks API Key", + ), + "OPENAI_API_KEY": ( + "", + "OpenAI API Key", + ), + }, + ) diff --git a/tests/integration/inference/test_text_inference.py b/tests/integration/inference/test_text_inference.py index 33ce07304..c8cceb0eb 100644 --- a/tests/integration/inference/test_text_inference.py +++ b/tests/integration/inference/test_text_inference.py @@ -6,6 +6,7 @@ import os +from time import sleep import pytest from pydantic import BaseModel @@ -23,11 +24,15 @@ def skip_if_model_doesnt_support_completion(client_with_models, model_id): provider_id = models[model_id].provider_id providers = {p.provider_id: p for p in client_with_models.providers.list()} provider = providers[provider_id] - if provider.provider_type in ( - "remote::openai", - "remote::anthropic", - "remote::gemini", - "remote::groq", + if ( + provider.provider_type + in ( + "remote::openai", + "remote::anthropic", + "remote::gemini", + "remote::groq", + ) + or "openai-compat" in provider.provider_type ): pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support completion") @@ -514,7 +519,7 @@ def test_text_chat_completion_tool_calling_tools_not_in_request( ) def test_text_chat_completion_with_multi_turn_tool_calling(client_with_models, text_model_id, test_case): """This test tests the model's tool calling loop in various scenarios""" - if "llama-4" not in text_model_id.lower(): + if "llama-4" not in text_model_id.lower() and "llama4" not in text_model_id.lower(): pytest.xfail("Not tested for non-llama4 models yet") tc = TestCase(test_case) @@ -545,7 +550,7 @@ def test_text_chat_completion_with_multi_turn_tool_calling(client_with_models, t ) op_msg = response.completion_message 
messages.append(op_msg.model_dump()) - # pprint(op_msg) + # print(op_msg) assert op_msg.role == "assistant" expected = tc["expected"].pop(0) @@ -568,3 +573,6 @@ def test_text_chat_completion_with_multi_turn_tool_calling(client_with_models, t actual_answer = op_msg.content.lower() # pprint(actual_answer) assert expected["answer"] in actual_answer + + # sleep to avoid rate limit + sleep(1) diff --git a/tests/integration/inference/test_vision_inference.py b/tests/integration/inference/test_vision_inference.py index a4fd5bcc6..d47dd3d64 100644 --- a/tests/integration/inference/test_vision_inference.py +++ b/tests/integration/inference/test_vision_inference.py @@ -76,8 +76,9 @@ def multi_image_data(): @pytest.mark.parametrize("stream", [True, False]) def test_image_chat_completion_multiple_images(client_with_models, vision_model_id, multi_image_data, stream): - if "llama-4" not in vision_model_id.lower() and "gpt-4o" not in vision_model_id.lower(): - pytest.skip("Skip for non-llama4, gpt4o models") + supported_models = ["llama-4", "gpt-4o", "llama4"] + if not any(model in vision_model_id.lower() for model in supported_models): + pytest.skip(f"Skip for non-supported model: {vision_model_id}") messages = [ { diff --git a/tests/integration/test_cases/inference/chat_completion.json b/tests/integration/test_cases/inference/chat_completion.json index c84d29e64..01956bd59 100644 --- a/tests/integration/test_cases/inference/chat_completion.json +++ b/tests/integration/test_cases/inference/chat_completion.json @@ -220,7 +220,7 @@ "description": "Availability status of the product." 
}, "tags": { - "param_type": "list", + "param_type": "list[str]", "description": "List of product tags" } } @@ -294,7 +294,7 @@ "description": "Location of the event" }, "participants": { - "param_type": "list", + "param_type": "list[str]", "description": "List of participant names" } } From c6e93e32f62cbe8e67bc56e0822fc2ff1b04f48b Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Tue, 8 Apr 2025 03:46:13 -0400 Subject: [PATCH 08/24] feat: Updated playground rag to use session id for persistent conversation (#1870) # What does this PR do? This PR updates the [playground RAG example](llama_stack/distribution/ui/page/playground/rag.py) so that the agent is able to use its builtin conversation history. Here we are using streamlit's `cache_resource` functionality to prevent the agent from re-initializing after every interaction as well as storing its session_id in the `session_state`. This allows the agent in the RAG example to behave more closely to how it works using the python-client directly. [//]: # (If resolving an issue, uncomment and update the line below) Closes #1869 ## Test Plan Without these changes, if you ask it "What is 2 + 2"? followed by the question "What did I just ask?" It will provide an obviously incorrect answer. With these changes, you can ask the same series of questions and it will provide the correct answer. [//]: # (## Documentation) Signed-off-by: Michael Clifford --- .../distribution/ui/page/playground/rag.py | 48 +++++++++++-------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/llama_stack/distribution/ui/page/playground/rag.py b/llama_stack/distribution/ui/page/playground/rag.py index fcd0f908e..bb31bd2a7 100644 --- a/llama_stack/distribution/ui/page/playground/rag.py +++ b/llama_stack/distribution/ui/page/playground/rag.py @@ -4,6 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import uuid + import streamlit as st from llama_stack_client import Agent, AgentEventLogger, RAGDocument @@ -102,8 +104,8 @@ def rag_chat_page(): # Add clear chat button to sidebar if st.button("Clear Chat", use_container_width=True): - st.session_state.messages = [] - st.rerun() + st.session_state.clear() + st.cache_resource.clear() # Chat Interface if "messages" not in st.session_state: @@ -123,23 +125,31 @@ def rag_chat_page(): else: strategy = {"type": "greedy"} - agent = Agent( - llama_stack_api.client, - model=selected_model, - instructions=system_prompt, - sampling_params={ - "strategy": strategy, - }, - tools=[ - dict( - name="builtin::rag/knowledge_search", - args={ - "vector_db_ids": list(selected_vector_dbs), - }, - ) - ], - ) - session_id = agent.create_session("rag-session") + @st.cache_resource + def create_agent(): + return Agent( + llama_stack_api.client, + model=selected_model, + instructions=system_prompt, + sampling_params={ + "strategy": strategy, + }, + tools=[ + dict( + name="builtin::rag/knowledge_search", + args={ + "vector_db_ids": list(selected_vector_dbs), + }, + ) + ], + ) + + agent = create_agent() + + if "agent_session_id" not in st.session_state: + st.session_state["agent_session_id"] = agent.create_session(session_name=f"rag_demo_{uuid.uuid4()}") + + session_id = st.session_state["agent_session_id"] # Chat input if prompt := st.chat_input("Ask a question about your documents"): From 031a40bec0ccd2ff66967c40037726dde7055127 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Tue, 8 Apr 2025 09:07:25 -0700 Subject: [PATCH 09/24] fix: type (#1898) # What does this PR do? 
## Test Plan --- llama_stack/templates/verification/verification.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llama_stack/templates/verification/verification.py b/llama_stack/templates/verification/verification.py index 7ef8d6a96..e6f74aad8 100644 --- a/llama_stack/templates/verification/verification.py +++ b/llama_stack/templates/verification/verification.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import List, Tuple +from typing import Dict, List, Tuple from llama_stack.apis.models.models import ModelType from llama_stack.distribution.datatypes import ( @@ -43,6 +43,7 @@ from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOC from llama_stack.providers.remote.vector_io.pgvector.config import ( PGVectorVectorIOConfig, ) +from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry from llama_stack.templates.template import ( DistributionTemplate, RunConfigSettings, @@ -50,7 +51,7 @@ from llama_stack.templates.template import ( ) -def get_inference_providers() -> Tuple[List[Provider], List[ModelInput]]: +def get_inference_providers() -> Tuple[List[Provider], Dict[str, List[ProviderModelEntry]]]: # in this template, we allow each API key to be optional providers = [ ( From 0431a6e90b5e1e325eb874a38b78b34cee9ae611 Mon Sep 17 00:00:00 2001 From: wesley chun Date: Tue, 8 Apr 2025 11:42:47 -0700 Subject: [PATCH 10/24] docs: colorize Discord badge & add icon in README (#1865) Update "chat" badge on README to make it more visible for visitors; changing the look from ![image](https://github.com/user-attachments/assets/630be671-a937-4841-8009-93e8eea1cbe1) ... to ... 
![image](https://github.com/user-attachments/assets/cfcb946a-e266-48da-bd50-c994cf1e3a9d) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0a741f916..617e5117b 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![PyPI version](https://img.shields.io/pypi/v/llama_stack.svg)](https://pypi.org/project/llama_stack/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/) [![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE) -[![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack) +[![Discord](https://img.shields.io/discord/1257833999603335178?color=6A7EC2&logo=discord&logoColor=ffffff)](https://discord.gg/llama-stack) [![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain) [![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain) From 7d9adf22adc14a5ce5a639f0b6cbd3ba269e7ca1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Wed, 9 Apr 2025 03:54:00 +0200 Subject: [PATCH 11/24] refactor: move missing tests to test directory (#1892) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the test_context.py under the main tests directory, and fix the code. The problem was that the function captures the initial values of the context variables and then restores those same initial values before each iteration. This means that any modifications made to the context variables during iteration are lost when the next iteration starts. 
Error was: ``` ====================================================== FAILURES ======================================================= ______________________________________ test_preserve_contexts_across_event_loops ______________________________________ @pytest.mark.asyncio async def test_preserve_contexts_across_event_loops(): """ Test that context variables are preserved across event loop boundaries with nested generators. This simulates the real-world scenario where: 1. A new event loop is created for each streaming request 2. The async generator runs inside that loop 3. There are multiple levels of nested generators 4. Context needs to be preserved across these boundaries """ # Create context variables request_id = ContextVar("request_id", default=None) user_id = ContextVar("user_id", default=None) # Set initial values # Results container to verify values across thread boundaries results = [] # Inner-most generator (level 2) async def inner_generator(): # Should have the context from the outer scope yield (1, request_id.get(), user_id.get()) # Modify one context variable user_id.set("user-modified") # Should reflect the modification yield (2, request_id.get(), user_id.get()) # Middle generator (level 1) async def middle_generator(): inner_gen = inner_generator() # Forward the first yield from inner item = await inner_gen.__anext__() yield item # Forward the second yield from inner item = await inner_gen.__anext__() yield item request_id.set("req-modified") # Add our own yield with both modified variables yield (3, request_id.get(), user_id.get()) # Function to run in a separate thread with a new event loop def run_in_new_loop(): # Create a new event loop for this thread loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: # Outer generator (runs in the new loop) async def outer_generator(): request_id.set("req-12345") user_id.set("user-6789") # Wrap the middle generator wrapped_gen = preserve_contexts_async_generator(middle_generator(), 
[request_id, user_id]) # Process all items from the middle generator async for item in wrapped_gen: # Store results for verification results.append(item) # Run the outer generator in the new loop loop.run_until_complete(outer_generator()) finally: loop.close() # Run the generator chain in a separate thread with a new event loop with ThreadPoolExecutor(max_workers=1) as executor: future = executor.submit(run_in_new_loop) future.result() # Wait for completion # Verify the results assert len(results) == 3 # First yield should have original values assert results[0] == (1, "req-12345", "user-6789") # Second yield should have modified user_id assert results[1] == (2, "req-12345", "user-modified") # Third yield should have both modified values > assert results[2] == (3, "req-modified", "user-modified") E AssertionError: assert (3, 'req-modified', 'user-6789') == (3, 'req-modified', 'user-modified') E E At index 2 diff: 'user-6789' != 'user-modified' E E Full diff: E ( E 3, E 'req-modified', E - 'user-modified', E + 'user-6789', E ) tests/unit/distribution/test_context.py:155: AssertionError -------------------------------------------------- Captured log call -------------------------------------------------- ERROR asyncio:base_events.py:1758 Task was destroyed but it is pending! task: ()>> ================================================== warnings summary =================================================== .venv/lib/python3.10/site-packages/pydantic/fields.py:1042 /Users/leseb/Documents/AI/llama-stack/.venv/lib/python3.10/site-packages/pydantic/fields.py:1042: PydanticDeprecatedSince20: Using extra keyword arguments on `Field` is deprecated and will be removed. Use `json_schema_extra` instead. (Extra keys: 'contentEncoding'). Deprecated in Pydantic V2.0 to be removed in V3.0. 
See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/ warn( -- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html =============================================== short test summary info =============================================== FAILED tests/unit/distribution/test_context.py::test_preserve_contexts_across_event_loops - AssertionError: assert (3, 'req-modified', 'user-6789') == (3, 'req-modified', 'user-modified') At index 2 diff: 'user-6789' != 'user-modified' Full diff: ( 3, 'req-modified', - 'user-modified', + 'user-6789', ) ``` [//]: # (## Documentation) Signed-off-by: Sébastien Han --- llama_stack/distribution/utils/context.py | 5 +++++ .../utils/tests => tests/unit/distribution}/test_context.py | 0 2 files changed, 5 insertions(+) rename {llama_stack/distribution/utils/tests => tests/unit/distribution}/test_context.py (100%) diff --git a/llama_stack/distribution/utils/context.py b/llama_stack/distribution/utils/context.py index fcc72161d..c34079ac6 100644 --- a/llama_stack/distribution/utils/context.py +++ b/llama_stack/distribution/utils/context.py @@ -29,6 +29,11 @@ def preserve_contexts_async_generator( context_var.set(initial_context_values[context_var.name]) item = await gen.__anext__() + + # Update our tracked values with any changes made during this iteration + for context_var in context_vars: + initial_context_values[context_var.name] = context_var.get() + yield item except StopAsyncIteration: diff --git a/llama_stack/distribution/utils/tests/test_context.py b/tests/unit/distribution/test_context.py similarity index 100% rename from llama_stack/distribution/utils/tests/test_context.py rename to tests/unit/distribution/test_context.py From bcbc56baa2368f18823cd507c7d68c6fec5a768a Mon Sep 17 00:00:00 2001 From: ehhuang Date: Tue, 8 Apr 2025 21:21:38 -0700 Subject: [PATCH 12/24] feat: adds test suite to verify provider's OAI compat endpoints (#1901) # What does this PR do? 
## Test Plan pytest verifications/openai/test_chat_completion.py --provider together --- tests/verifications/README.md | 65 + tests/verifications/REPORT.md | 88 + tests/verifications/__init__.py | 5 + tests/verifications/conftest.py | 28 + tests/verifications/generate_report.py | 485 +++ tests/verifications/openai/__init__.py | 5 + .../verifications/openai/fixtures/__init__.py | 5 + .../verifications/openai/fixtures/fixtures.py | 97 + tests/verifications/openai/fixtures/load.py | 16 + .../fixtures/test_cases/chat_completion.yaml | 162 + .../openai/test_chat_completion.py | 202 ++ .../test_results/fireworks_1744154308.json | 2744 ++++++++++++++++ .../test_results/openai_1744154522.json | 2672 ++++++++++++++++ .../test_results/together_1744154399.json | 2830 +++++++++++++++++ 14 files changed, 9404 insertions(+) create mode 100644 tests/verifications/README.md create mode 100644 tests/verifications/REPORT.md create mode 100644 tests/verifications/__init__.py create mode 100644 tests/verifications/conftest.py create mode 100755 tests/verifications/generate_report.py create mode 100644 tests/verifications/openai/__init__.py create mode 100644 tests/verifications/openai/fixtures/__init__.py create mode 100644 tests/verifications/openai/fixtures/fixtures.py create mode 100644 tests/verifications/openai/fixtures/load.py create mode 100644 tests/verifications/openai/fixtures/test_cases/chat_completion.yaml create mode 100644 tests/verifications/openai/test_chat_completion.py create mode 100644 tests/verifications/test_results/fireworks_1744154308.json create mode 100644 tests/verifications/test_results/openai_1744154522.json create mode 100644 tests/verifications/test_results/together_1744154399.json diff --git a/tests/verifications/README.md b/tests/verifications/README.md new file mode 100644 index 000000000..986ff1087 --- /dev/null +++ b/tests/verifications/README.md @@ -0,0 +1,65 @@ +# Llama Stack Verifications + +Llama Stack Verifications provide standardized test 
suites to ensure API compatibility and behavior consistency across different LLM providers. These tests help verify that different models and providers implement the expected interfaces and behaviors correctly. + +## Overview + +This framework allows you to run the same set of verification tests against different LLM providers' OpenAI-compatible endpoints (Fireworks, Together, Groq, Cerebras, etc., and OpenAI itself) to ensure they meet the expected behavior and interface standards. + +## Features + +The verification suite currently tests: + +- Basic chat completions (streaming and non-streaming) +- Image input capabilities +- Structured JSON output formatting +- Tool calling functionality + +## Running Tests + +To run the verification tests, use pytest with the following parameters: + +```bash +cd llama-stack +pytest tests/verifications/openai --provider=<provider-name> +``` + +Example: +```bash +# Run all tests +pytest tests/verifications/openai --provider=together + +# Only run tests with Llama 4 models +pytest tests/verifications/openai --provider=together -k 'Llama-4' +``` + +### Parameters + +- `--provider`: The provider name (openai, fireworks, together, groq, cerebras, etc.) +- `--base-url`: The base URL for the provider's API (optional - defaults to the standard URL for the specified provider) +- `--api-key`: Your API key for the provider (optional - defaults to the standard API_KEY name for the specified provider) + +## Supported Providers + +The verification suite currently supports: +- OpenAI +- Fireworks +- Together +- Groq +- Cerebras + +## Adding New Test Cases + +To add new test cases, create appropriate YAML files in the `openai/fixtures/test_cases/` directory following the existing patterns. 
+ + +## Structure + +- `__init__.py` - Marks the directory as a Python package +- `conftest.py` - Global pytest configuration and fixtures +- `openai/` - Tests specific to OpenAI-compatible APIs + - `fixtures/` - Test fixtures and utilities + - `fixtures.py` - Provider-specific fixtures + - `load.py` - Utilities for loading test cases + - `test_cases/` - YAML test case definitions + - `test_chat_completion.py` - Tests for chat completion APIs diff --git a/tests/verifications/REPORT.md b/tests/verifications/REPORT.md new file mode 100644 index 000000000..d5715ae21 --- /dev/null +++ b/tests/verifications/REPORT.md @@ -0,0 +1,88 @@ +# Test Results Report + +*Generated on: 2025-04-08 21:14:02* + +*This report was generated by running `python tests/verifications/generate_report.py`* + +## Legend + +- ✅ - Test passed +- ❌ - Test failed +- ⚪ - Test not applicable or not run for this model + + +## Summary + +| Provider | Pass Rate | Tests Passed | Total Tests | +| --- | --- | --- | --- | +| Together | 67.7% | 21 | 31 | +| Fireworks | 90.3% | 28 | 31 | +| Openai | 100.0% | 22 | 22 | + + + +## Together + +*Tests run on: 2025-04-08 16:19:59* + +```bash +pytest tests/verifications/openai/test_chat_completion.py --provider=together -v +``` + +| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-17B-128E-Instruct | Llama-4-Scout-17B-16E-Instruct | +| --- | --- | --- | --- | +| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | ✅ | +| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | ✅ | +| test_chat_non_streaming_image (case 0) | ⚪ | ✅ | ✅ | +| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | ✅ | +| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | ✅ | +| test_chat_non_streaming_tool_calling (case 0) | ✅ | ✅ | ✅ | +| test_chat_streaming_basic (case 0) | ✅ | ❌ | ❌ | +| test_chat_streaming_basic (case 1) | ✅ | ❌ | ❌ | +| test_chat_streaming_image (case 0) | ⚪ | ❌ | ❌ | +| test_chat_streaming_structured_output (case 0) | ✅ | ❌ | ❌ | +| 
test_chat_streaming_structured_output (case 1) | ✅ | ❌ | ❌ | + +## Fireworks + +*Tests run on: 2025-04-08 16:18:28* + +```bash +pytest tests/verifications/openai/test_chat_completion.py --provider=fireworks -v +``` + +| Test | Llama-3.3-70B-Instruct | Llama-4-Maverick-17B-128E-Instruct | Llama-4-Scout-17B-16E-Instruct | +| --- | --- | --- | --- | +| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | ✅ | +| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | ✅ | +| test_chat_non_streaming_image (case 0) | ⚪ | ✅ | ✅ | +| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | ✅ | +| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | ✅ | +| test_chat_non_streaming_tool_calling (case 0) | ✅ | ❌ | ❌ | +| test_chat_streaming_basic (case 0) | ✅ | ✅ | ✅ | +| test_chat_streaming_basic (case 1) | ✅ | ✅ | ✅ | +| test_chat_streaming_image (case 0) | ⚪ | ✅ | ✅ | +| test_chat_streaming_structured_output (case 0) | ✅ | ✅ | ✅ | +| test_chat_streaming_structured_output (case 1) | ❌ | ✅ | ✅ | + +## Openai + +*Tests run on: 2025-04-08 16:22:02* + +```bash +pytest tests/verifications/openai/test_chat_completion.py --provider=openai -v +``` + +| Test | gpt-4o | gpt-4o-mini | +| --- | --- | --- | +| test_chat_non_streaming_basic (case 0) | ✅ | ✅ | +| test_chat_non_streaming_basic (case 1) | ✅ | ✅ | +| test_chat_non_streaming_image (case 0) | ✅ | ✅ | +| test_chat_non_streaming_structured_output (case 0) | ✅ | ✅ | +| test_chat_non_streaming_structured_output (case 1) | ✅ | ✅ | +| test_chat_non_streaming_tool_calling (case 0) | ✅ | ✅ | +| test_chat_streaming_basic (case 0) | ✅ | ✅ | +| test_chat_streaming_basic (case 1) | ✅ | ✅ | +| test_chat_streaming_image (case 0) | ✅ | ✅ | +| test_chat_streaming_structured_output (case 0) | ✅ | ✅ | +| test_chat_streaming_structured_output (case 1) | ✅ | ✅ | diff --git a/tests/verifications/__init__.py b/tests/verifications/__init__.py new file mode 100644 index 000000000..756f351d8 --- /dev/null +++ b/tests/verifications/__init__.py @@ -0,0 
+1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. diff --git a/tests/verifications/conftest.py b/tests/verifications/conftest.py new file mode 100644 index 000000000..08967e834 --- /dev/null +++ b/tests/verifications/conftest.py @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + + +def pytest_addoption(parser): + parser.addoption( + "--base-url", + action="store", + help="Base URL for OpenAI compatible API", + ) + parser.addoption( + "--api-key", + action="store", + help="API key", + ) + parser.addoption( + "--provider", + action="store", + help="Provider to use for testing", + ) + + +pytest_plugins = [ + "tests.verifications.openai.fixtures.fixtures", +] diff --git a/tests/verifications/generate_report.py b/tests/verifications/generate_report.py new file mode 100755 index 000000000..98a5930da --- /dev/null +++ b/tests/verifications/generate_report.py @@ -0,0 +1,485 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +""" +Test Report Generator + +Requirements: + pip install pytest-json-report + +Usage: + # Generate a report using existing test results + python tests/verifications/generate_report.py + + # Run tests and generate a report + python tests/verifications/generate_report.py --run-tests + + # Run tests for specific providers + python tests/verifications/generate_report.py --run-tests --providers fireworks openai + + # Save the report to a custom location + python tests/verifications/generate_report.py --output custom_report.md + + # Clean up old test result files + python tests/verifications/generate_report.py --cleanup +""" + +import argparse +import json +import os +import re +import subprocess +import time +from collections import defaultdict +from pathlib import Path + +# Define the root directory for test results +RESULTS_DIR = Path(__file__).parent / "test_results" +RESULTS_DIR.mkdir(exist_ok=True) + +# Maximum number of test result files to keep per provider +MAX_RESULTS_PER_PROVIDER = 1 + +# Custom order of providers +PROVIDER_ORDER = ["together", "fireworks", "groq", "cerebras", "openai"] + +# Dictionary to store providers and their models (will be populated dynamically) +PROVIDERS = defaultdict(set) + +# Tests will be dynamically extracted from results +ALL_TESTS = set() + + +def run_tests(provider): + """Run pytest for a specific provider and save results""" + print(f"Running tests for provider: {provider}") + + timestamp = int(time.time()) + result_file = RESULTS_DIR / f"{provider}_{timestamp}.json" + temp_json_file = RESULTS_DIR / f"temp_{provider}_{timestamp}.json" + + # Run pytest with JSON output + cmd = [ + "python", + "-m", + "pytest", + "tests/verifications/openai/test_chat_completion.py", + f"--provider={provider}", + "-v", + "--json-report", + f"--json-report-file={temp_json_file}", + ] + + try: + result = subprocess.run(cmd, capture_output=True, text=True) + print(f"Pytest exit code: {result.returncode}") + + # Check if the JSON file was created 
+ if temp_json_file.exists(): + # Read the JSON file and save it to our results format + with open(temp_json_file, "r") as f: + test_results = json.load(f) + + # Save results to our own format with a trailing newline + with open(result_file, "w") as f: + json.dump(test_results, f, indent=2) + f.write("\n") # Add a trailing newline for precommit + + # Clean up temp file + temp_json_file.unlink() + + print(f"Test results saved to {result_file}") + return result_file + else: + print(f"Error: JSON report file not created for {provider}") + print(f"Command stdout: {result.stdout}") + print(f"Command stderr: {result.stderr}") + return None + except Exception as e: + print(f"Error running tests for {provider}: {e}") + return None + + +def parse_results(result_file): + """Parse the test results file and extract pass/fail by model and test""" + if not os.path.exists(result_file): + print(f"Results file does not exist: {result_file}") + return {} + + with open(result_file, "r") as f: + results = json.load(f) + + # Initialize results dictionary + parsed_results = defaultdict(lambda: defaultdict(dict)) + provider = os.path.basename(result_file).split("_")[0] + + # Debug: Print summary of test results + print(f"Test results summary for {provider}:") + print(f"Total tests: {results.get('summary', {}).get('total', 0)}") + print(f"Passed: {results.get('summary', {}).get('passed', 0)}") + print(f"Failed: {results.get('summary', {}).get('failed', 0)}") + print(f"Error: {results.get('summary', {}).get('error', 0)}") + print(f"Skipped: {results.get('summary', {}).get('skipped', 0)}") + + # Extract test results + if "tests" not in results or not results["tests"]: + print(f"No test results found in {result_file}") + return parsed_results + + # Map for normalizing model names + model_name_map = { + "Llama-3.3-8B-Instruct": "Llama-3.3-8B-Instruct", + "Llama-3.3-70B-Instruct": "Llama-3.3-70B-Instruct", + "Llama-3.2-11B-Vision-Instruct": "Llama-3.2-11B-Vision-Instruct", + 
"Llama-4-Scout-17B-16E": "Llama-4-Scout-17B-16E-Instruct", + "Llama-4-Scout-17B-16E-Instruct": "Llama-4-Scout-17B-16E-Instruct", + "Llama-4-Maverick-17B-128E": "Llama-4-Maverick-17B-128E-Instruct", + "Llama-4-Maverick-17B-128E-Instruct": "Llama-4-Maverick-17B-128E-Instruct", + "gpt-4o": "gpt-4o", + "gpt-4o-mini": "gpt-4o-mini", + } + + # Keep track of all models found for this provider + provider_models = set() + + # Track all unique test cases for each base test + test_case_counts = defaultdict(int) + + # First pass: count the number of cases for each test + for test in results["tests"]: + test_id = test.get("nodeid", "") + + if "call" in test: + test_name = test_id.split("::")[1].split("[")[0] + input_output_match = re.search(r"\[input_output(\d+)-", test_id) + if input_output_match: + test_case_counts[test_name] += 1 + + # Second pass: process the tests with case numbers only for tests with multiple cases + for test in results["tests"]: + test_id = test.get("nodeid", "") + outcome = test.get("outcome", "") + + # Only process tests that have been executed (not setup errors) + if "call" in test: + # Regular test that actually ran + test_name = test_id.split("::")[1].split("[")[0] + + # Extract input_output parameter to differentiate between test cases + input_output_match = re.search(r"\[input_output(\d+)-", test_id) + input_output_index = input_output_match.group(1) if input_output_match else "" + + # Create a more detailed test name with case number only if there are multiple cases + detailed_test_name = test_name + if input_output_index and test_case_counts[test_name] > 1: + detailed_test_name = f"{test_name} (case {input_output_index})" + + # Track all unique test names + ALL_TESTS.add(detailed_test_name) + + # Extract model name from test_id using a more robust pattern + model_match = re.search(r"\[input_output\d+-([^\]]+)\]", test_id) + if model_match: + raw_model = model_match.group(1) + model = model_name_map.get(raw_model, raw_model) + + # Add to set of 
known models for this provider + provider_models.add(model) + + # Also update the global PROVIDERS dictionary + PROVIDERS[provider].add(model) + + # Store the result + if outcome == "passed": + parsed_results[provider][model][detailed_test_name] = True + else: + parsed_results[provider][model][detailed_test_name] = False + + print(f"Parsed test result: {detailed_test_name} for model {model}: {outcome}") + elif outcome == "error" and "setup" in test and test.get("setup", {}).get("outcome") == "failed": + # This is a setup failure, which likely means a configuration issue + # Extract the base test name and model name + parts = test_id.split("::") + if len(parts) > 1: + test_name = parts[1].split("[")[0] + + # Extract input_output parameter to differentiate between test cases + input_output_match = re.search(r"\[input_output(\d+)-", test_id) + input_output_index = input_output_match.group(1) if input_output_match else "" + + # Create a more detailed test name with case number only if there are multiple cases + detailed_test_name = test_name + if input_output_index and test_case_counts[test_name] > 1: + detailed_test_name = f"{test_name} (case {input_output_index})" + + if detailed_test_name in ALL_TESTS: + # Use a more robust pattern for model extraction + model_match = re.search(r"\[input_output\d+-([^\]]+)\]", test_id) + if model_match: + raw_model = model_match.group(1) + model = model_name_map.get(raw_model, raw_model) + + # Add to set of known models for this provider + provider_models.add(model) + + # Also update the global PROVIDERS dictionary + PROVIDERS[provider].add(model) + + # Mark setup failures as false (failed) + parsed_results[provider][model][detailed_test_name] = False + print(f"Parsed setup failure: {detailed_test_name} for model {model}") + + # Debug: Print parsed results + if not parsed_results[provider]: + print(f"Warning: No test results parsed for provider {provider}") + else: + for model, tests in parsed_results[provider].items(): + 
print(f"Model {model}: {len(tests)} test results") + + return parsed_results + + +def cleanup_old_results(): + """Clean up old test result files, keeping only the newest N per provider""" + for provider in PROVIDERS.keys(): + # Get all result files for this provider + provider_files = list(RESULTS_DIR.glob(f"{provider}_*.json")) + + # Sort by timestamp (newest first) + provider_files.sort(key=lambda x: int(x.stem.split("_")[1]), reverse=True) + + # Remove old files beyond the max to keep + if len(provider_files) > MAX_RESULTS_PER_PROVIDER: + for old_file in provider_files[MAX_RESULTS_PER_PROVIDER:]: + try: + old_file.unlink() + print(f"Removed old result file: {old_file}") + except Exception as e: + print(f"Error removing file {old_file}: {e}") + + +def get_latest_results_by_provider(): + """Get the latest test result file for each provider""" + provider_results = {} + + # Get all result files + result_files = list(RESULTS_DIR.glob("*.json")) + + # Extract all provider names from filenames + all_providers = set() + for file in result_files: + # File format is provider_timestamp.json + parts = file.stem.split("_") + if len(parts) >= 2: + all_providers.add(parts[0]) + + # Group by provider + for provider in all_providers: + provider_files = [f for f in result_files if f.name.startswith(f"{provider}_")] + + # Sort by timestamp (newest first) + provider_files.sort(key=lambda x: int(x.stem.split("_")[1]), reverse=True) + + if provider_files: + provider_results[provider] = provider_files[0] + + return provider_results + + +def generate_report(results_dict, output_file=None): + """Generate the markdown report""" + if output_file is None: + # Default to creating the report in the same directory as this script + output_file = Path(__file__).parent / "REPORT.md" + else: + output_file = Path(output_file) + + # Get the timestamp from result files + provider_timestamps = {} + provider_results = get_latest_results_by_provider() + for provider, result_file in 
provider_results.items(): + # Extract timestamp from filename (format: provider_timestamp.json) + try: + timestamp_str = result_file.stem.split("_")[1] + timestamp = int(timestamp_str) + formatted_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp)) + provider_timestamps[provider] = formatted_time + except (IndexError, ValueError): + provider_timestamps[provider] = "Unknown" + + # Convert provider model sets to sorted lists + for provider in PROVIDERS: + PROVIDERS[provider] = sorted(PROVIDERS[provider]) + + # Sort tests alphabetically + sorted_tests = sorted(ALL_TESTS) + + report = ["# Test Results Report\n"] + report.append(f"*Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n") + report.append("*This report was generated by running `python tests/verifications/generate_report.py`*\n") + + # Icons for pass/fail + pass_icon = "✅" + fail_icon = "❌" + na_icon = "⚪" + + # Add emoji legend + report.append("## Legend\n") + report.append(f"- {pass_icon} - Test passed") + report.append(f"- {fail_icon} - Test failed") + report.append(f"- {na_icon} - Test not applicable or not run for this model") + report.append("\n") + + # Add a summary section + report.append("## Summary\n") + + # Count total tests and passes + total_tests = 0 + passed_tests = 0 + provider_totals = {} + + # Prepare summary data + for provider in PROVIDERS.keys(): + provider_passed = 0 + provider_total = 0 + + if provider in results_dict: + provider_models = PROVIDERS[provider] + for model in provider_models: + if model in results_dict[provider]: + model_results = results_dict[provider][model] + for test in sorted_tests: + if test in model_results: + provider_total += 1 + total_tests += 1 + if model_results[test]: + provider_passed += 1 + passed_tests += 1 + + provider_totals[provider] = (provider_passed, provider_total) + + # Add summary table + report.append("| Provider | Pass Rate | Tests Passed | Total Tests |") + report.append("| --- | --- | --- | --- |") + + # Use the custom order 
for summary table + for provider in [p for p in PROVIDER_ORDER if p in PROVIDERS]: + passed, total = provider_totals.get(provider, (0, 0)) + pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A" + report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |") + + # Add providers not in the custom order + for provider in [p for p in PROVIDERS if p not in PROVIDER_ORDER]: + passed, total = provider_totals.get(provider, (0, 0)) + pass_rate = f"{(passed / total * 100):.1f}%" if total > 0 else "N/A" + report.append(f"| {provider.capitalize()} | {pass_rate} | {passed} | {total} |") + + report.append("\n") + + # Process each provider in the custom order, then any additional providers + for provider in sorted( + PROVIDERS.keys(), key=lambda p: (PROVIDER_ORDER.index(p) if p in PROVIDER_ORDER else float("inf"), p) + ): + if not PROVIDERS[provider]: + # Skip providers with no models + continue + + report.append(f"\n## {provider.capitalize()}\n") + + # Add timestamp when test was run + if provider in provider_timestamps: + report.append(f"*Tests run on: {provider_timestamps[provider]}*\n") + + # Add test command for reproducing results + test_cmd = f"pytest tests/verifications/openai/test_chat_completion.py --provider={provider} -v" + report.append(f"```bash\n{test_cmd}\n```\n") + + # Get the relevant models for this provider + provider_models = PROVIDERS[provider] + + # Create table header with models as columns + header = "| Test | " + " | ".join(provider_models) + " |" + separator = "| --- | " + " | ".join(["---"] * len(provider_models)) + " |" + + report.append(header) + report.append(separator) + + # Get results for this provider + provider_results = results_dict.get(provider, {}) + + # Add rows for each test + for test in sorted_tests: + row = f"| {test} |" + + # Add results for each model in this test + for model in provider_models: + if model in provider_results and test in provider_results[model]: + result = pass_icon if 
def main():
    """Generate the verification report, optionally running the tests first.

    Command-line flags:
        --run-tests: run the test suite for every selected provider and build
            the report from those fresh results. Without it, the report is
            built from whatever ``get_latest_results_by_provider()`` returns.
        --providers: provider names, space- and/or comma-separated. Only
            consulted together with ``--run-tests``; defaults to
            ``PROVIDER_ORDER``.
        --output: report location, forwarded unchanged to ``generate_report``.
    """
    parser = argparse.ArgumentParser(description="Generate test report")
    parser.add_argument("--run-tests", action="store_true", help="Run tests before generating report")
    parser.add_argument(
        "--providers",
        type=str,
        nargs="+",
        help="Specify providers to test (comma-separated or space-separated, default: all)",
    )
    parser.add_argument("--output", type=str, help="Output file location (default: tests/verifications/REPORT.md)")
    args = parser.parse_args()

    all_results = {}

    if args.run_tests:
        if args.providers:
            # Every argument may itself be a comma-separated list. Strip
            # whitespace and drop empty fragments so inputs such as
            # "fireworks, together," or "a,,b" never call run_tests("").
            test_providers = [
                name.strip()
                for provider_arg in args.providers
                for name in provider_arg.split(",")
                if name.strip()
            ]
        else:
            # No explicit selection: test every provider in the default order.
            test_providers = [name.strip() for name in PROVIDER_ORDER]

        for provider in test_providers:
            result_file = run_tests(provider)
            # A falsy result_file means the run produced nothing to parse.
            if result_file:
                all_results.update(parse_results(result_file))
    else:
        # Re-use results that are already on disk.
        for result_file in get_latest_results_by_provider().values():
            all_results.update(parse_results(result_file))

    generate_report(all_results, args.output)

    cleanup_old_results()


if __name__ == "__main__":
    main()
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

# --- tests/verifications/openai/fixtures/fixtures.py ---

import os

import pytest
from openai import OpenAI


@pytest.fixture
def providers_model_mapping():
    """
    Mapping from model names used in test cases to provider's model names.

    The outer key is the provider name, the inner key is the generic model
    name used in the YAML test cases, and the value is the identifier that
    provider expects in the ``model`` field of a request.
    """
    return {
        "fireworks": {
            # Llama 3.3 must map to the v3p3 deployment; the v3p1 id is the
            # Llama 3.1 model and would report results for the wrong model.
            "Llama-3.3-70B-Instruct": "accounts/fireworks/models/llama-v3p3-70b-instruct",
            "Llama-3.2-11B-Vision-Instruct": "accounts/fireworks/models/llama-v3p2-11b-vision-instruct",
            "Llama-4-Scout-17B-16E-Instruct": "accounts/fireworks/models/llama4-scout-instruct-basic",
            "Llama-4-Maverick-17B-128E-Instruct": "accounts/fireworks/models/llama4-maverick-instruct-basic",
        },
        "together": {
            "Llama-3.3-70B-Instruct": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
            "Llama-3.2-11B-Vision-Instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
            "Llama-4-Scout-17B-16E-Instruct": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
            "Llama-4-Maverick-17B-128E-Instruct": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
        },
        "groq": {
            "Llama-3.3-70B-Instruct": "llama-3.3-70b-versatile",
            "Llama-3.2-11B-Vision-Instruct": "llama-3.2-11b-vision-preview",
            "Llama-4-Scout-17B-16E-Instruct": "llama-4-scout-17b-16e-instruct",
            "Llama-4-Maverick-17B-128E-Instruct": "llama-4-maverick-17b-128e-instruct",
        },
        "cerebras": {
            "Llama-3.3-70B-Instruct": "llama-3.3-70b",
        },
        "openai": {
            "gpt-4o": "gpt-4o",
            "gpt-4o-mini": "gpt-4o-mini",
        },
    }


@pytest.fixture
def provider_metadata():
    """Map each provider name to ``(base URL, API-key environment variable)``."""
    return {
        "fireworks": ("https://api.fireworks.ai/inference/v1", "FIREWORKS_API_KEY"),
        "together": ("https://api.together.xyz/v1", "TOGETHER_API_KEY"),
        "groq": ("https://api.groq.com/openai/v1", "GROQ_API_KEY"),
        "cerebras": ("https://api.cerebras.ai/v1", "CEREBRAS_API_KEY"),
        "openai": ("https://api.openai.com/v1", "OPENAI_API_KEY"),
    }


@pytest.fixture
def provider(request, provider_metadata):
    """Resolve the provider name from ``--provider`` and/or ``--base-url``.

    Returns:
        The provider name given on the command line, or, when only
        ``--base-url`` was given, the provider whose known base URL equals it.

    Raises:
        ValueError: if neither option is given, if ``--provider`` is not a
            known provider, if both options are given but disagree, or if
            ``--base-url`` matches no known provider.
    """
    provider = request.config.getoption("--provider")
    base_url = request.config.getoption("--base-url")

    if provider:
        if provider not in provider_metadata:
            known = ", ".join(sorted(provider_metadata))
            raise ValueError(f"Unknown provider {provider}; expected one of: {known}")
        if base_url and provider_metadata[provider][0] != base_url:
            raise ValueError(f"Provider {provider} is not supported for base URL {base_url}")
        return provider

    if not base_url:
        raise ValueError("Provider and base URL are not provided")

    # Use a distinct loop variable: re-using ``provider`` here would leave it
    # bound to the last dictionary key when no base URL matches, silently
    # selecting the wrong provider instead of failing.
    for name, metadata in provider_metadata.items():
        if metadata[0] == base_url:
            return name

    raise ValueError(f"No known provider matches base URL {base_url}")


@pytest.fixture
def base_url(request, provider, provider_metadata):
    """Return ``--base-url`` if given, otherwise the provider's known base URL."""
    return request.config.getoption("--base-url") or provider_metadata[provider][0]


@pytest.fixture
def api_key(request, provider, provider_metadata):
    """Return ``--api-key`` if given, otherwise the provider's API-key env var value.

    The result is ``None`` when neither the option nor the variable is set.
    """
    return request.config.getoption("--api-key") or os.getenv(provider_metadata[provider][1])


@pytest.fixture
def model_mapping(provider, providers_model_mapping):
    """Return the generic-name -> provider-name model mapping for ``provider``."""
    return providers_model_mapping[provider]


@pytest.fixture
def openai_client(base_url, api_key):
    """Build an OpenAI SDK client pointed at the selected provider."""
    return OpenAI(
        base_url=base_url,
        api_key=api_key,
    )


# --- tests/verifications/openai/fixtures/load.py ---

from pathlib import Path

import yaml


def load_test_cases(name: str):
    """Load ``test_cases/<name>.yaml`` from the directory containing this module.

    Args:
        name: file stem of the YAML document, e.g. ``"chat_completion"``.

    Returns:
        The parsed YAML document as produced by ``yaml.safe_load``.
    """
    yaml_path = Path(__file__).parent / "test_cases" / f"{name}.yaml"
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default locale encoding.
    with yaml_path.open("r", encoding="utf-8") as f:
        return yaml.safe_load(f)
+ role: user + output: Saturn + model: + - Llama-3.3-8B-Instruct + - Llama-3.3-70B-Instruct + - Llama-4-Scout-17B-16E + - Llama-4-Scout-17B-16E-Instruct + - Llama-4-Maverick-17B-128E + - Llama-4-Maverick-17B-128E-Instruct + - gpt-4o + - gpt-4o-mini +test_chat_image: + test_name: test_chat_image + test_params: + input_output: + - input: + messages: + - content: + - text: What is in this image? + type: text + - image_url: + url: https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg + type: image_url + role: user + output: llama + model: + - Llama-4-Scout-17B-16E + - Llama-4-Scout-17B-16E-Instruct + - Llama-4-Maverick-17B-128E + - Llama-4-Maverick-17B-128E-Instruct + - gpt-4o + - gpt-4o-mini +test_chat_structured_output: + test_name: test_chat_structured_output + test_params: + input_output: + - input: + messages: + - content: Extract the event information. + role: system + - content: Alice and Bob are going to a science fair on Friday. + role: user + response_format: + json_schema: + name: calendar_event + schema: + properties: + date: + title: Date + type: string + name: + title: Name + type: string + participants: + items: + type: string + title: Participants + type: array + required: + - name + - date + - participants + title: CalendarEvent + type: object + type: json_schema + output: valid_calendar_event + - input: + messages: + - content: You are a helpful math tutor. Guide the user through the solution + step by step. 
+ role: system + - content: how can I solve 8x + 7 = -23 + role: user + response_format: + json_schema: + name: math_reasoning + schema: + $defs: + Step: + properties: + explanation: + title: Explanation + type: string + output: + title: Output + type: string + required: + - explanation + - output + title: Step + type: object + properties: + final_answer: + title: Final Answer + type: string + steps: + items: + $ref: '#/$defs/Step' + title: Steps + type: array + required: + - steps + - final_answer + title: MathReasoning + type: object + type: json_schema + output: valid_math_reasoning + model: + - Llama-3.3-8B-Instruct + - Llama-3.3-70B-Instruct + - Llama-4-Scout-17B-16E + - Llama-4-Scout-17B-16E-Instruct + - Llama-4-Maverick-17B-128E + - Llama-4-Maverick-17B-128E-Instruct + - gpt-4o + - gpt-4o-mini +test_tool_calling: + test_name: test_tool_calling + test_params: + input_output: + - input: + messages: + - content: You are a helpful assistant that can use tools to get information. + role: system + - content: What's the weather like in San Francisco? + role: user + tools: + - function: + description: Get current temperature for a given location. + name: get_weather + parameters: + additionalProperties: false + properties: + location: + description: "City and country e.g. Bogot\xE1, Colombia" + type: string + required: + - location + type: object + type: function + output: get_weather_tool_call + model: + - Llama-3.3-70B-Instruct + - Llama-4-Scout-17B-16E + - Llama-4-Scout-17B-16E-Instruct + - Llama-4-Maverick-17B-128E + - Llama-4-Maverick-17B-128E-Instruct + - gpt-4o + - gpt-4o-mini diff --git a/tests/verifications/openai/test_chat_completion.py b/tests/verifications/openai/test_chat_completion.py new file mode 100644 index 000000000..c6a10de7b --- /dev/null +++ b/tests/verifications/openai/test_chat_completion.py @@ -0,0 +1,202 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any

import pytest
from pydantic import BaseModel, ValidationError

from tests.verifications.openai.fixtures.load import load_test_cases

chat_completion_test_cases = load_test_cases("chat_completion")


def _params(case_name: str, key: str) -> list:
    """Return the ``test_params[key]`` list of the named YAML test case."""
    return chat_completion_test_cases[case_name]["test_params"][key]


def _message_text(response) -> str:
    """Return the first choice's text of a non-streaming response.

    ``content`` may be ``None`` (for example when the model answers with tool
    calls only); that is normalised to ``""`` so the caller's assertion fails
    cleanly instead of raising ``AttributeError``.
    """
    return response.choices[0].message.content or ""


def _stream_text(stream) -> str:
    """Concatenate the text deltas of a streaming chat completion.

    Chunks with an empty ``choices`` list are skipped rather than indexed,
    and a ``None`` delta content contributes nothing.
    """
    parts = []
    for chunk in stream:
        if not chunk.choices:
            continue
        parts.append(chunk.choices[0].delta.content or "")
    return "".join(parts)


class CalendarEvent(BaseModel):
    """Expected shape of the ``valid_calendar_event`` structured output."""

    name: str
    date: str
    participants: list[str]


class Step(BaseModel):
    """One reasoning step of the ``valid_math_reasoning`` structured output."""

    explanation: str
    output: str


class MathReasoning(BaseModel):
    """Expected shape of the ``valid_math_reasoning`` structured output."""

    steps: list[Step]
    final_answer: str


# Expected-output tag used in the YAML test cases -> model that validates it.
_STRUCTURED_OUTPUT_MODELS: dict[str, type[BaseModel]] = {
    "valid_calendar_event": CalendarEvent,
    "valid_math_reasoning": MathReasoning,
}


@pytest.fixture
def correct_model_name(model, provider, providers_model_mapping):
    """Return the provider-specific model name based on the generic model name."""
    mapping = providers_model_mapping[provider]
    if model not in mapping:
        pytest.skip(f"Provider {provider} does not support model {model}")
    return mapping[model]


@pytest.mark.parametrize("model", _params("test_chat_basic", "model"))
@pytest.mark.parametrize("input_output", _params("test_chat_basic", "input_output"))
def test_chat_non_streaming_basic(openai_client, input_output, correct_model_name):
    """A plain text prompt returns an assistant message containing the expected word."""
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=False,
    )
    assert response.choices[0].message.role == "assistant"
    assert input_output["output"].lower() in _message_text(response).lower()


@pytest.mark.parametrize("model", _params("test_chat_basic", "model"))
@pytest.mark.parametrize("input_output", _params("test_chat_basic", "input_output"))
def test_chat_streaming_basic(openai_client, input_output, correct_model_name):
    """A plain text prompt streams content containing the expected word."""
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=True,
    )
    content = _stream_text(response)

    # TODO: add detailed type validation

    assert input_output["output"].lower() in content.lower()


@pytest.mark.parametrize("model", _params("test_chat_image", "model"))
@pytest.mark.parametrize("input_output", _params("test_chat_image", "input_output"))
def test_chat_non_streaming_image(openai_client, input_output, correct_model_name):
    """An image prompt returns an assistant message containing the expected word."""
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=False,
    )
    assert response.choices[0].message.role == "assistant"
    assert input_output["output"].lower() in _message_text(response).lower()


@pytest.mark.parametrize("model", _params("test_chat_image", "model"))
@pytest.mark.parametrize("input_output", _params("test_chat_image", "input_output"))
def test_chat_streaming_image(openai_client, input_output, correct_model_name):
    """An image prompt streams content containing the expected word."""
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        stream=True,
    )
    content = _stream_text(response)

    # TODO: add detailed type validation

    assert input_output["output"].lower() in content.lower()


@pytest.mark.parametrize("model", _params("test_chat_structured_output", "model"))
@pytest.mark.parametrize("input_output", _params("test_chat_structured_output", "input_output"))
def test_chat_non_streaming_structured_output(openai_client, input_output, correct_model_name):
    """A ``response_format`` request returns JSON matching the expected schema."""
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        response_format=input_output["input"]["response_format"],
        stream=False,
    )

    assert response.choices[0].message.role == "assistant"
    maybe_json_content = response.choices[0].message.content

    validate_structured_output(maybe_json_content, input_output["output"])


@pytest.mark.parametrize("model", _params("test_chat_structured_output", "model"))
@pytest.mark.parametrize("input_output", _params("test_chat_structured_output", "input_output"))
def test_chat_streaming_structured_output(openai_client, input_output, correct_model_name):
    """A streamed ``response_format`` request assembles into JSON matching the schema."""
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        response_format=input_output["input"]["response_format"],
        stream=True,
    )
    maybe_json_content = _stream_text(response)
    validate_structured_output(maybe_json_content, input_output["output"])


@pytest.mark.parametrize("model", _params("test_tool_calling", "model"))
@pytest.mark.parametrize("input_output", _params("test_tool_calling", "input_output"))
def test_chat_non_streaming_tool_calling(openai_client, input_output, correct_model_name):
    """A request offering the ``get_weather`` tool results in a call to that tool."""
    response = openai_client.chat.completions.create(
        model=correct_model_name,
        messages=input_output["input"]["messages"],
        tools=input_output["input"]["tools"],
        stream=False,
    )

    message = response.choices[0].message
    assert message.role == "assistant"
    # ``tool_calls`` is None when the model made no call; a truthiness check
    # turns that into an assertion failure rather than a TypeError from len().
    assert message.tool_calls, "expected at least one tool call in the response"
    assert input_output["output"] == "get_weather_tool_call"
    assert message.tool_calls[0].function.name == "get_weather"
    # TODO: add detailed type validation


def get_structured_output(maybe_json_content: str, schema_name: str) -> Any | None:
    """Parse ``maybe_json_content`` with the model registered for ``schema_name``.

    Args:
        maybe_json_content: raw text returned by the model.
        schema_name: expected-output tag, ``"valid_calendar_event"`` or
            ``"valid_math_reasoning"``.

    Returns:
        The validated model instance, or ``None`` when the tag is unknown,
        the content is empty, or the content fails validation.
    """
    model_cls = _STRUCTURED_OUTPUT_MODELS.get(schema_name)
    if model_cls is None or not maybe_json_content:
        return None
    try:
        return model_cls.model_validate_json(maybe_json_content)
    except ValidationError:
        # Malformed JSON and schema mismatches both surface as ValidationError.
        return None


def validate_structured_output(maybe_json_content: str, schema_name: str) -> None:
    """Assert that ``maybe_json_content`` is a valid instance of ``schema_name``.

    Raises:
        AssertionError: if the content cannot be parsed into the expected
            model or fails the per-schema checks below.
    """
    structured_output = get_structured_output(maybe_json_content, schema_name)
    assert structured_output is not None, (
        f"response is not a valid {schema_name!r} document: {maybe_json_content!r}"
    )
    if schema_name == "valid_calendar_event":
        assert structured_output.name is not None
        assert structured_output.date is not None
        assert len(structured_output.participants) == 2
    elif schema_name == "valid_math_reasoning":
        assert len(structured_output.final_answer) > 0
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-gpt-4o]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-gpt-4o]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-gpt-4o-mini]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-gpt-4o]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-gpt-4o]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-gpt-4o-mini]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-gpt-4o]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-gpt-4o]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-gpt-4o]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-gpt-4o]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-gpt-4o-mini]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-gpt-4o]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-gpt-4o]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-gpt-4o-mini]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-3.3-70B-Instruct]", + 
"type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-gpt-4o]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 138 + } + ] + } + ], + "tests": [ + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.17320987500716, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider fireworks does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.000177707988768816, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009193749981932342, + "outcome": "passed" + }, + "call": { + "duration": 1.1473859580000862, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00043337501119822264, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01645291701424867, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider fireworks does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0002898749662563205, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01562033302616328, + "outcome": "passed" + }, + "call": { + 
"duration": 0.8782661251025274, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0002795408945530653, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.008571124984882772, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider fireworks does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0003043749602511525, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00842841702979058, + "outcome": "passed" + }, + "call": { + "duration": 1.3863223339430988, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0009970410028472543, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-gpt-4o]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + 
"verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007089875056408346, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider fireworks does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.00017958390526473522, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-gpt-4o-mini]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005809499998576939, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider fireworks does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.00016495899762958288, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0119722920935601, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider fireworks does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.00016962504014372826, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005716291954740882, + "outcome": "passed" + }, + "call": { + "duration": 0.6822018750244752, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005292498972266912, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.025827708072029054, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider fireworks does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.000295999925583601, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010980832972563803, + "outcome": "passed" + }, + "call": { + 
"duration": 0.7537062909686938, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0008091670460999012, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006567832897417247, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider fireworks does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0001545000122860074, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005985083989799023, + "outcome": "passed" + }, + "call": { + "duration": 0.7263387079583481, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0006324589485302567, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-gpt-4o]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-gpt-4o]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o", + "test_chat_completion.py", + "openai", + 
"verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0171962499152869, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider fireworks does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.000780042028054595, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-gpt-4o-mini]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01365620899014175, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider fireworks does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.00016758404672145844, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0064070840599015355, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider fireworks does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.0002031669719144702, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 40, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010951624950394034, + "outcome": "passed" + }, + "call": { + "duration": 0.5433399169705808, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0013178749941289425, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.022056750021874905, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider fireworks does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0006570409750565886, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 40, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.008314333041198552, + "outcome": "passed" + }, + "call": { + "duration": 
0.7779882500180975, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0006799160037189722, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.03601404093205929, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider fireworks does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.000610582996159792, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 40, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.014321292052045465, + "outcome": "passed" + }, + "call": { + "duration": 1.0243758750148118, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0010485410457476974, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-gpt-4o]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + 
"llama-stack", + "" + ], + "setup": { + "duration": 0.021133000031113625, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider fireworks does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.0005400830414146185, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-gpt-4o-mini]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007212458993308246, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider fireworks does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.00026770797558128834, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012334750033915043, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider fireworks does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.00042683398351073265, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "lineno": 40, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.011477917083539069, + "outcome": "passed" + }, + "call": { + "duration": 1.670572166913189, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005759169580414891, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.024620208074338734, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider fireworks does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0005166250048205256, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 40, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.008708957931958139, + "outcome": "passed" + }, + "call": { + "duration": 
0.6654335829662159, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0002927089808508754, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.018128167022950947, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider fireworks does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0001929170684888959, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 40, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0063874589977785945, + "outcome": "passed" + }, + "call": { + "duration": 0.8047525839647278, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00039245898369699717, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-gpt-4o]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-gpt-4o]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + 
"llama-stack", + "" + ], + "setup": { + "duration": 0.01366533397231251, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider fireworks does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.00028241705149412155, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-gpt-4o-mini]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010844790958799422, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider fireworks does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.000258082989603281, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 60, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00936354196164757, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 61, 'Skipped: Provider fireworks does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00020533299539238214, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 60, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.008578249951824546, + "outcome": "passed" + }, + "call": { + "duration": 2.6288582499837503, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0006052498938515782, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 60, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.02061279199551791, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 61, 'Skipped: Provider fireworks does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00029320805333554745, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 60, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 
0.00995812495239079, + "outcome": "passed" + }, + "call": { + "duration": 3.0904540000483394, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0003214169992133975, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-gpt-4o]", + "lineno": 60, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_image[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0261635419446975, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 61, 'Skipped: Provider fireworks does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.00032716698478907347, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-gpt-4o-mini]", + "lineno": 60, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_image[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.027220541960559785, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 61, 'Skipped: Provider fireworks does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.0003192499279975891, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 75, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + 
"input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010883458075113595, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 76, 'Skipped: Provider fireworks does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0002687909873202443, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 75, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0171177500160411, + "outcome": "passed" + }, + "call": { + "duration": 1.6752691670553759, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004877089522778988, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 75, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.011608208995312452, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 76, 'Skipped: Provider fireworks does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00017137499526143074, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 75, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009284624946303666, + "outcome": "passed" + }, + "call": { + "duration": 3.537356249988079, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005068340105935931, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-gpt-4o]", + "lineno": 75, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_image[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.016660499968566, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 76, 'Skipped: Provider fireworks does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.00029341597110033035, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-gpt-4o-mini]", + "lineno": 75, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_image[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01374066702555865, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 76, 'Skipped: 
Provider fireworks does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.0002625000197440386, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.013120374991558492, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider fireworks does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.00021954195108264685, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.015080374898388982, + "outcome": "passed" + }, + "call": { + "duration": 1.157175041968003, + "outcome": "passed" + }, + "teardown": { + "duration": 0.000495875021442771, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + 
"input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.013946042046882212, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider fireworks does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0002954580122604966, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.011617792071774602, + "outcome": "passed" + }, + "call": { + "duration": 0.9537639999762177, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004819999448955059, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.027436082949861884, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider fireworks does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 
0.00030274991877377033, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.016110333963297307, + "outcome": "passed" + }, + "call": { + "duration": 0.8493227910948917, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004883749643340707, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-gpt-4o]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.017850833013653755, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider fireworks does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.0003287500003352761, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-gpt-4o-mini]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 
0.012523208046332002, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider fireworks does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.00023500004317611456, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007516667013987899, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider fireworks does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.00018912507221102715, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007337165996432304, + "outcome": "passed" + }, + "call": { + "duration": 3.124099582899362, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0006703329272568226, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "lineno": 95, + 
"outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.014259999967180192, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider fireworks does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00030262500513345003, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010863124975003302, + "outcome": "passed" + }, + "call": { + "duration": 1.3330956250429153, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00018679199274629354, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005797958001494408, + "outcome": "skipped", + "longrepr": 
"('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider fireworks does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00017529097385704517, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005647709011100233, + "outcome": "passed" + }, + "call": { + "duration": 3.2295467499643564, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005654999986290932, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-gpt-4o]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-gpt-4o]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007151791942305863, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider fireworks does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.00015316694043576717, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-gpt-4o-mini]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + 
"test_chat_non_streaming_structured_output[input_output1-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006435790914110839, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider fireworks does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.00015954102855175734, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006164791993796825, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider fireworks does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.00014074996579438448, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 117, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010064583038911223, + "outcome": "passed" + }, + "call": { + "duration": 1.1676458748988807, + "outcome": "passed" + }, + 
"teardown": { + "duration": 0.0002513329964131117, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.011011417023837566, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider fireworks does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00020608294289559126, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 117, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.011654542060568929, + "outcome": "passed" + }, + "call": { + "duration": 0.7950789160095155, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0002690000692382455, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + 
"test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0066834589233621955, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider fireworks does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00017270795069634914, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 117, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.011390416999347508, + "outcome": "passed" + }, + "call": { + "duration": 0.7844940840732306, + "outcome": "passed" + }, + "teardown": { + "duration": 0.000511458027176559, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-gpt-4o]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005813500029034913, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider fireworks does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.00015495799016207457, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-gpt-4o-mini]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0075639160349965096, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider fireworks does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.00014358304906636477, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.008526541059836745, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider fireworks does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.00015841599088162184, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "lineno": 117, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + 
"llama-stack", + "" + ], + "setup": { + "duration": 0.007805416011251509, + "outcome": "passed" + }, + "call": { + "duration": 13.25898533302825, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py", + "lineno": 196, + "message": "assert None is not None" + }, + "traceback": [ + { + "path": "tests/verifications/openai/test_chat_completion.py", + "lineno": 136, + "message": "" + }, + { + "path": "tests/verifications/openai/test_chat_completion.py", + "lineno": 196, + "message": "AssertionError" + } + ], + "longrepr": "openai_client = \ninput_output = {'input': {'messages': [{'content': 'You are a helpful math tutor. Guide the user through the solution step by step.',... ['steps', 'final_answer'], 'title': 'MathReasoning', ...}}, 'type': 'json_schema'}}, 'output': 'valid_math_reasoning'}\ncorrect_model_name = 'accounts/fireworks/models/llama-v3p1-70b-instruct'\n\n @pytest.mark.parametrize(\n \"model\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"model\"],\n )\n @pytest.mark.parametrize(\n \"input_output\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"input_output\"],\n )\n def test_chat_streaming_structured_output(openai_client, input_output, correct_model_name):\n response = openai_client.chat.completions.create(\n model=correct_model_name,\n messages=input_output[\"input\"][\"messages\"],\n response_format=input_output[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n maybe_json_content += chunk.choices[0].delta.content or \"\"\n> validate_structured_output(maybe_json_content, input_output[\"output\"])\n\ntests/verifications/openai/test_chat_completion.py:136: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \n\nmaybe_json_content = '{ \"final_answer\": \"}To solve the equation 8x + 7 = -23, we need to isolate the variable 
x. We can do this by followin...tassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistantassistant'\nschema_name = 'valid_math_reasoning'\n\n def validate_structured_output(maybe_json_content: str, schema_name: str) -> None:\n structured_output = get_structured_output(maybe_json_content, schema_name)\n> assert structured_output is not None\nE assert None is not None\n\ntests/verifications/openai/test_chat_completion.py:196: AssertionError" + }, + "teardown": { + "duration": 0.00022583396639674902, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006412541959434748, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider fireworks does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0001449589617550373, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 117, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010353000019676983, + "outcome": "passed" + }, + "call": { + "duration": 4.559281209018081, + 
"outcome": "passed" + }, + "teardown": { + "duration": 0.00021179206669330597, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.011320417048409581, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider fireworks does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0001623749267309904, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 117, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005637791007757187, + "outcome": "passed" + }, + "call": { + "duration": 2.9282109580235556, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00019149994477629662, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-gpt-4o]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-gpt-4o]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o", + 
"test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.021475916961207986, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider fireworks does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.0002605828922241926, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-gpt-4o-mini]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012046082993037999, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider fireworks does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.00016966694965958595, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 138, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00782629195600748, + "outcome": "passed" + }, + "call": { + "duration": 0.9290615000063553, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004110001027584076, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 138, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00842183397617191, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 139, 'Skipped: Provider fireworks does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00023745803628116846, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 138, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010762874968349934, + "outcome": "passed" + }, + "call": { + "duration": 23.62101216695737, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py", + "lineno": 156, + "message": "TypeError: object of type 'NoneType' has no len()" + }, + "traceback": [ + { + "path": "tests/verifications/openai/test_chat_completion.py", + "lineno": 156, + "message": "TypeError" + } + ], + "longrepr": "openai_client = \ninput_output = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 
'function'}]}, 'output': 'get_weather_tool_call'}\ncorrect_model_name = 'accounts/fireworks/models/llama4-scout-instruct-basic'\n\n @pytest.mark.parametrize(\n \"model\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"model\"],\n )\n @pytest.mark.parametrize(\n \"input_output\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"input_output\"],\n )\n def test_chat_non_streaming_tool_calling(openai_client, input_output, correct_model_name):\n response = openai_client.chat.completions.create(\n model=correct_model_name,\n messages=input_output[\"input\"][\"messages\"],\n tools=input_output[\"input\"][\"tools\"],\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert len(response.choices[0].message.tool_calls) > 0\nE TypeError: object of type 'NoneType' has no len()\n\ntests/verifications/openai/test_chat_completion.py:156: TypeError" + }, + "teardown": { + "duration": 0.0004520840011537075, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 138, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00953104195650667, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 139, 'Skipped: Provider fireworks does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00017912499606609344, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + 
"lineno": 138, + "outcome": "failed", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010302042006514966, + "outcome": "passed" + }, + "call": { + "duration": 5.55651158397086, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py", + "lineno": 156, + "message": "TypeError: object of type 'NoneType' has no len()" + }, + "traceback": [ + { + "path": "tests/verifications/openai/test_chat_completion.py", + "lineno": 156, + "message": "TypeError" + } + ], + "longrepr": "openai_client = \ninput_output = {'input': {'messages': [{'content': 'You are a helpful assistant that can use tools to get information.', 'role': 'sys..., 'properties': {...}, 'required': [...], 'type': 'object'}}, 'type': 'function'}]}, 'output': 'get_weather_tool_call'}\ncorrect_model_name = 'accounts/fireworks/models/llama4-maverick-instruct-basic'\n\n @pytest.mark.parametrize(\n \"model\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"model\"],\n )\n @pytest.mark.parametrize(\n \"input_output\",\n chat_completion_test_cases[\"test_tool_calling\"][\"test_params\"][\"input_output\"],\n )\n def test_chat_non_streaming_tool_calling(openai_client, input_output, correct_model_name):\n response = openai_client.chat.completions.create(\n model=correct_model_name,\n messages=input_output[\"input\"][\"messages\"],\n tools=input_output[\"input\"][\"tools\"],\n stream=False,\n )\n \n assert response.choices[0].message.role == \"assistant\"\n> assert len(response.choices[0].message.tool_calls) > 0\nE TypeError: object of type 'NoneType' has no len()\n\ntests/verifications/openai/test_chat_completion.py:156: TypeError" + }, + "teardown": 
{ + "duration": 0.0003929579397663474, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-gpt-4o]", + "lineno": 138, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01593891705852002, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 139, 'Skipped: Provider fireworks does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.0003579579060897231, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-gpt-4o-mini]", + "lineno": 138, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01874550001230091, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 139, 'Skipped: Provider fireworks does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.00031995808240026236, + "outcome": "passed" + } + } + ] +} diff --git a/tests/verifications/test_results/openai_1744154522.json b/tests/verifications/test_results/openai_1744154522.json new file mode 100644 index 000000000..310f3500d --- /dev/null +++ b/tests/verifications/test_results/openai_1744154522.json @@ -0,0 +1,2672 @@ +{ + "created": 1744154576.251519, + "duration": 51.50739002227783, + "exitcode": 0, + "root": "/Users/erichuang/projects/llama-stack", + 
"environment": {}, + "summary": { + "skipped": 61, + "passed": 22, + "total": 83, + "collected": 83 + }, + "collectors": [ + { + "nodeid": "", + "outcome": "passed", + "result": [ + { + "nodeid": "tests/verifications/openai/test_chat_completion.py", + "type": "Module" + } + ] + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py", + "outcome": "passed", + "result": [ + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-gpt-4o]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "type": "Function", + 
"lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-gpt-4o]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-gpt-4o-mini]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + 
"nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-gpt-4o]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-gpt-4o]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-gpt-4o-mini]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-gpt-4o]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-gpt-4o]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-gpt-4o]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", 
+ "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-gpt-4o]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-gpt-4o-mini]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-gpt-4o]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-gpt-4o]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-gpt-4o-mini]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-gpt-4o]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 138 + } + ] + } + ], + "tests": [ + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0531630830373615, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider openai does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.0001657919492572546, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006063499953597784, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider openai does not support model Llama-3.3-70B-Instruct')" + }, + "teardown": { + "duration": 0.00014004099648445845, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + 
"setup": { + "duration": 0.005356832989491522, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00016508297994732857, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006139832898043096, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E-Instruct')" + }, + "teardown": { + "duration": 0.00014450005255639553, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00542324990965426, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00014112505596131086, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.004965625004842877, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E-Instruct')" + }, + "teardown": { + "duration": 0.00013720791321247816, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-gpt-4o]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005054084002040327, + "outcome": "passed" + }, + "call": { + "duration": 0.6271341659594327, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00043925002682954073, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-gpt-4o-mini]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0159178749890998, + "outcome": "passed" + }, + "call": { + "duration": 0.44088316697161645, + "outcome": "passed" 
+ }, + "teardown": { + "duration": 0.0006467089988291264, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.016705541987903416, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider openai does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.0005769169656559825, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012067249976098537, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider openai does not support model Llama-3.3-70B-Instruct')" + }, + "teardown": { + "duration": 0.00016683305148035288, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + 
"input_output1-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009295083000324667, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00017204193864017725, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009534333017654717, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E-Instruct')" + }, + "teardown": { + "duration": 0.00020175008103251457, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006628665956668556, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider openai does not support model 
Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0003687090938910842, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0061322919791564345, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E-Instruct')" + }, + "teardown": { + "duration": 0.0003664169926196337, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-gpt-4o]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-gpt-4o]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00623433303553611, + "outcome": "passed" + }, + "call": { + "duration": 0.7898445830214769, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0006602079374715686, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-gpt-4o-mini]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": 
{ + "duration": 0.014758958015590906, + "outcome": "passed" + }, + "call": { + "duration": 1.1555478329537436, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0011781250359490514, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.03454475000035018, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider openai does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.000967124942690134, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.025206666090525687, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider openai does not support model Llama-3.3-70B-Instruct')" + }, + "teardown": { + "duration": 0.000189624959602952, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + 
"test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.014331333106383681, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00023133307695388794, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009339665994048119, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E-Instruct')" + }, + "teardown": { + "duration": 0.00020329200197011232, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010387042071670294, + "outcome": "skipped", + "longrepr": 
"('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00018254201859235764, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012297999928705394, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E-Instruct')" + }, + "teardown": { + "duration": 0.00018662505317479372, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-gpt-4o]", + "lineno": 40, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_basic[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006984042003750801, + "outcome": "passed" + }, + "call": { + "duration": 0.32529433304443955, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0033042499562725425, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-gpt-4o-mini]", + "lineno": 40, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_basic[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + 
"input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01832079200539738, + "outcome": "passed" + }, + "call": { + "duration": 0.48440287495031953, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00047233293298631907, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.02893691696226597, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider openai does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.0001747499918565154, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006553041050210595, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider openai does not support model Llama-3.3-70B-Instruct')" + }, + "teardown": { + "duration": 0.00016829196829348803, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.013746666954830289, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00019237503875046968, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007175332983024418, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E-Instruct')" + }, + "teardown": { + "duration": 0.0001873329747468233, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + 
"llama-stack", + "" + ], + "setup": { + "duration": 0.006127291941083968, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00019004102796316147, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006421791040338576, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E-Instruct')" + }, + "teardown": { + "duration": 0.0001611249754205346, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-gpt-4o]", + "lineno": 40, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_basic[input_output1-gpt-4o]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009806249989196658, + "outcome": "passed" + }, + "call": { + "duration": 0.9556747920578346, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004937920020893216, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-gpt-4o-mini]", + "lineno": 40, + "outcome": "passed", + 
"keywords": [ + "test_chat_streaming_basic[input_output1-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.03146500000730157, + "outcome": "passed" + }, + "call": { + "duration": 1.082494750036858, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0006242080125957727, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 60, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.021534667001105845, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 61, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0003469999646767974, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 60, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.025929750059731305, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 61, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E-Instruct')" + }, + 
"teardown": { + "duration": 0.0008774169255048037, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 60, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012507125036790967, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 61, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00022008304949849844, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 60, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.008156375028192997, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 61, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E-Instruct')" + }, + "teardown": { + "duration": 0.0002079169498756528, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-gpt-4o]", + "lineno": 60, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_image[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + 
"input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012587749981321394, + "outcome": "passed" + }, + "call": { + "duration": 2.7379885419504717, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00044579198583960533, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-gpt-4o-mini]", + "lineno": 60, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_image[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.017111250082962215, + "outcome": "passed" + }, + "call": { + "duration": 2.599374584038742, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0009177909232676029, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 75, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.02198700001463294, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 76, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00042749999556690454, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 75, + "outcome": "skipped", + "keywords": [ + 
"test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.015032917028293014, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 76, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E-Instruct')" + }, + "teardown": { + "duration": 0.00041016703471541405, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 75, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.013976250076666474, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 76, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00027600000612437725, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 75, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00799729092977941, + "outcome": "skipped", + "longrepr": 
"('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 76, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E-Instruct')" + }, + "teardown": { + "duration": 0.00020320899784564972, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-gpt-4o]", + "lineno": 75, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_image[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010483540943823755, + "outcome": "passed" + }, + "call": { + "duration": 4.249965250026435, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0008596250554546714, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-gpt-4o-mini]", + "lineno": 75, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_image[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.018141582957468927, + "outcome": "passed" + }, + "call": { + "duration": 2.297856790944934, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005075830267742276, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + 
"duration": 0.017144332989118993, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider openai does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.0006829580524936318, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009827250032685697, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider openai does not support model Llama-3.3-70B-Instruct')" + }, + "teardown": { + "duration": 0.00024204188957810402, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006737958989106119, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00022729102056473494, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006030917051248252, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E-Instruct')" + }, + "teardown": { + "duration": 0.00022229203023016453, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009183833957649767, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00022629194427281618, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + 
"pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007097500027157366, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E-Instruct')" + }, + "teardown": { + "duration": 0.00826825003605336, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-gpt-4o]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006604874972254038, + "outcome": "passed" + }, + "call": { + "duration": 1.4057738750707358, + "outcome": "passed" + }, + "teardown": { + "duration": 0.000506040989421308, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-gpt-4o-mini]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.015966624952852726, + "outcome": "passed" + }, + "call": { + "duration": 0.540478374925442, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0009536249563097954, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "lineno": 
95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.020631707971915603, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider openai does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.0004928340204060078, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.016745459055528045, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider openai does not support model Llama-3.3-70B-Instruct')" + }, + "teardown": { + "duration": 0.0003412909572944045, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012252667103894055, + "outcome": "skipped", + 
"longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00028650008607655764, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01128904102370143, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E-Instruct')" + }, + "teardown": { + "duration": 0.00027041707653552294, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009191332967020571, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0002074999501928687, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007687666919082403, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E-Instruct')" + }, + "teardown": { + "duration": 0.0002027079463005066, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-gpt-4o]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-gpt-4o]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007542708073742688, + "outcome": "passed" + }, + "call": { + "duration": 4.244797708000988, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0012778330128639936, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-gpt-4o-mini]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.026919999974779785, + "outcome": "passed" + 
}, + "call": { + "duration": 9.006108874920756, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00046324997674673796, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01554666692391038, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider openai does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.0004023330984637141, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007354958914220333, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider openai does not support model Llama-3.3-70B-Instruct')" + }, + "teardown": { + "duration": 0.0002900830004364252, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + 
"test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.017274250043556094, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0002668329980224371, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006813667016103864, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E-Instruct')" + }, + "teardown": { + "duration": 0.00024500000290572643, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007385291974060237, + "outcome": "skipped", + "longrepr": 
"('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00017024995759129524, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00857366609852761, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E-Instruct')" + }, + "teardown": { + "duration": 0.00016850000247359276, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-gpt-4o]", + "lineno": 117, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005570041947066784, + "outcome": "passed" + }, + "call": { + "duration": 0.8564215000951663, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004029169213026762, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-gpt-4o-mini]", + "lineno": 117, + "outcome": "passed", + "keywords": [ + 
"test_chat_streaming_structured_output[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00786762498319149, + "outcome": "passed" + }, + "call": { + "duration": 0.6419672920601442, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005102079594507813, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.017147499951533973, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider openai does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.00032350001856684685, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01194737502373755, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider openai does not support model Llama-3.3-70B-Instruct')" + }, + 
"teardown": { + "duration": 0.0005004579434171319, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010250666993670166, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00022554199676960707, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007847042055800557, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E-Instruct')" + }, + "teardown": { + "duration": 0.000283458037301898, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + 
"test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.008078000042587519, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0001794169656932354, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007204750087112188, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E-Instruct')" + }, + "teardown": { + "duration": 0.00017725001089274883, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-gpt-4o]", + "lineno": 117, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-gpt-4o]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006797667010687292, + "outcome": "passed" + }, + "call": { + "duration": 5.411579457926564, + 
"outcome": "passed" + }, + "teardown": { + "duration": 0.001134666963480413, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-gpt-4o-mini]", + "lineno": 117, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.025059624924324453, + "outcome": "passed" + }, + "call": { + "duration": 9.112342999898829, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0009202499641105533, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 138, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.024287916952744126, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 139, 'Skipped: Provider openai does not support model Llama-3.3-70B-Instruct')" + }, + "teardown": { + "duration": 0.00015587499365210533, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 138, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + 
"llama-stack", + "" + ], + "setup": { + "duration": 0.006531457998789847, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 139, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00014670798555016518, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 138, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006190375075675547, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 139, 'Skipped: Provider openai does not support model Llama-4-Scout-17B-16E-Instruct')" + }, + "teardown": { + "duration": 0.0001603750279173255, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 138, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005670750048011541, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 139, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0001479999627918005, 
+ "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 138, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005662833107635379, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 139, 'Skipped: Provider openai does not support model Llama-4-Maverick-17B-128E-Instruct')" + }, + "teardown": { + "duration": 0.0001480829669162631, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-gpt-4o]", + "lineno": 138, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00573637499473989, + "outcome": "passed" + }, + "call": { + "duration": 0.6269576249178499, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0010142088867723942, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-gpt-4o-mini]", + "lineno": 138, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01623620803002268, + 
"outcome": "passed" + }, + "call": { + "duration": 0.7144521250156686, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0011040839599445462, + "outcome": "passed" + } + } + ] +} diff --git a/tests/verifications/test_results/together_1744154399.json b/tests/verifications/test_results/together_1744154399.json new file mode 100644 index 000000000..ae801e83b --- /dev/null +++ b/tests/verifications/test_results/together_1744154399.json @@ -0,0 +1,2830 @@ +{ + "created": 1744154470.9868789, + "duration": 59.6187219619751, + "exitcode": 1, + "root": "/Users/erichuang/projects/llama-stack", + "environment": {}, + "summary": { + "skipped": 52, + "passed": 21, + "failed": 10, + "total": 83, + "collected": 83 + }, + "collectors": [ + { + "nodeid": "", + "outcome": "passed", + "result": [ + { + "nodeid": "tests/verifications/openai/test_chat_completion.py", + "type": "Module" + } + ] + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py", + "outcome": "passed", + "result": [ + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-gpt-4o]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-gpt-4o]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-gpt-4o-mini]", + "type": "Function", + "lineno": 25 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-gpt-4o]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-gpt-4o]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-gpt-4o-mini]", + "type": "Function", + "lineno": 40 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-gpt-4o]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 60 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-gpt-4o]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 75 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + 
"lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-gpt-4o]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-gpt-4o]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-gpt-4o-mini]", + "type": "Function", + "lineno": 95 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-gpt-4o]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-gpt-4o]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-gpt-4o-mini]", + "type": "Function", + "lineno": 117 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-3.3-70B-Instruct]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-gpt-4o]", + "type": "Function", + "lineno": 138 + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-gpt-4o-mini]", + "type": "Function", + "lineno": 138 + } + ] + } + ], + "tests": [ + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.39231995795853436, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider together does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.0002014160854741931, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + 
"duration": 0.0071710830088704824, + "outcome": "passed" + }, + "call": { + "duration": 0.7968309168936685, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004362498875707388, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012780916062183678, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider together does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00029158301185816526, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.013563874992541969, + "outcome": "passed" + }, + "call": { + "duration": 0.5071627920260653, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005456249928101897, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + 
"pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.020708917058072984, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider together does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00030325003899633884, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.014170082984492183, + "outcome": "passed" + }, + "call": { + "duration": 1.2383921250002459, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0009597090538591146, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-gpt-4o]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.013402250013314188, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider together does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.00028245802968740463, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output0-gpt-4o-mini]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.008693707990460098, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider together does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.00016249995678663254, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005904874997213483, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider together does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.0001960420049726963, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 
0.006532749976031482, + "outcome": "passed" + }, + "call": { + "duration": 0.5410778749501333, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00019516597967594862, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009374375105835497, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider together does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00015524995978921652, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007205875008367002, + "outcome": "passed" + }, + "call": { + "duration": 0.42584729101508856, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0009506250498816371, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "parametrize", + 
"pytestmark", + "input_output1-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.029625958995893598, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider together does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0001860830234363675, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 25, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.023576707928441465, + "outcome": "passed" + }, + "call": { + "duration": 1.2249365829629824, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004278330598026514, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-gpt-4o]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-gpt-4o]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.014816291979514062, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider together does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.00029558304231613874, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_basic[input_output1-gpt-4o-mini]", + "lineno": 25, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_basic[input_output1-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012769333901815116, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 26, 'Skipped: Provider together does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.00024329195730388165, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009145625052042305, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider together does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.00021195888984948397, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 40, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0133140409598127, + 
"outcome": "passed" + }, + "call": { + "duration": 0.7228892090497538, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004301250446587801, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.013998750015161932, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider together does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0002961249556392431, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 40, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012570249964483082, + "outcome": "passed" + }, + "call": { + "duration": 0.7193170419195667, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py", + "lineno": 54, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai/test_chat_completion.py", + "lineno": 54, + "message": "IndexError" + } + ], + "longrepr": "openai_client = \ninput_output = {'input': {'messages': [{'content': 'Which planet do humans live 
on?', 'role': 'user'}]}, 'output': 'Earth'}\ncorrect_model_name = 'meta-llama/Llama-4-Scout-17B-16E-Instruct'\n\n @pytest.mark.parametrize(\"model\", chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"model\"])\n @pytest.mark.parametrize(\n \"input_output\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"input_output\"],\n )\n def test_chat_streaming_basic(openai_client, input_output, correct_model_name):\n response = openai_client.chat.completions.create(\n model=correct_model_name,\n messages=input_output[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai/test_chat_completion.py:54: IndexError" + }, + "teardown": { + "duration": 0.00022504094522446394, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006660082959569991, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider together does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0001445829402655363, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 40, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_basic[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + 
"pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.021228999947197735, + "outcome": "passed" + }, + "call": { + "duration": 1.5670281670754775, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py", + "lineno": 54, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai/test_chat_completion.py", + "lineno": 54, + "message": "IndexError" + } + ], + "longrepr": "openai_client = \ninput_output = {'input': {'messages': [{'content': 'Which planet do humans live on?', 'role': 'user'}]}, 'output': 'Earth'}\ncorrect_model_name = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\n\n @pytest.mark.parametrize(\"model\", chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"model\"])\n @pytest.mark.parametrize(\n \"input_output\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"input_output\"],\n )\n def test_chat_streaming_basic(openai_client, input_output, correct_model_name):\n response = openai_client.chat.completions.create(\n model=correct_model_name,\n messages=input_output[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai/test_chat_completion.py:54: IndexError" + }, + "teardown": { + "duration": 0.0004656669916585088, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-gpt-4o]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + 
"tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009595917072147131, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider together does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.00025625003036111593, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output0-gpt-4o-mini]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009242708911187947, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider together does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.0002484159776940942, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00905474997125566, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider together does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.00023312494158744812, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "lineno": 40, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007183165987953544, + "outcome": "passed" + }, + "call": { + "duration": 1.0667660840554163, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005163750611245632, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.05233616603072733, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider together does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0003471659729257226, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 40, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.015932541922666132, + "outcome": "passed" + }, + "call": { + "duration": 
0.41540695796720684, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py", + "lineno": 54, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai/test_chat_completion.py", + "lineno": 54, + "message": "IndexError" + } + ], + "longrepr": "openai_client = \ninput_output = {'input': {'messages': [{'content': 'Which planet has rings around it with a name starting with letter S?', 'role': 'user'}]}, 'output': 'Saturn'}\ncorrect_model_name = 'meta-llama/Llama-4-Scout-17B-16E-Instruct'\n\n @pytest.mark.parametrize(\"model\", chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"model\"])\n @pytest.mark.parametrize(\n \"input_output\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"input_output\"],\n )\n def test_chat_streaming_basic(openai_client, input_output, correct_model_name):\n response = openai_client.chat.completions.create(\n model=correct_model_name,\n messages=input_output[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai/test_chat_completion.py:54: IndexError" + }, + "teardown": { + "duration": 0.0002845840062946081, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007243875064887106, + "outcome": "skipped", + "longrepr": 
"('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider together does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00016258296091109514, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 40, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_basic[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009275624994188547, + "outcome": "passed" + }, + "call": { + "duration": 1.43309554096777, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py", + "lineno": 54, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai/test_chat_completion.py", + "lineno": 54, + "message": "IndexError" + } + ], + "longrepr": "openai_client = \ninput_output = {'input': {'messages': [{'content': 'Which planet has rings around it with a name starting with letter S?', 'role': 'user'}]}, 'output': 'Saturn'}\ncorrect_model_name = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\n\n @pytest.mark.parametrize(\"model\", chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"model\"])\n @pytest.mark.parametrize(\n \"input_output\",\n chat_completion_test_cases[\"test_chat_basic\"][\"test_params\"][\"input_output\"],\n )\n def test_chat_streaming_basic(openai_client, input_output, correct_model_name):\n response = openai_client.chat.completions.create(\n model=correct_model_name,\n messages=input_output[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in 
response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai/test_chat_completion.py:54: IndexError" + }, + "teardown": { + "duration": 0.0003690000157803297, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-gpt-4o]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-gpt-4o]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.011570582981221378, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider together does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.00024937500711530447, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_basic[input_output1-gpt-4o-mini]", + "lineno": 40, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_basic[input_output1-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010756584000773728, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 41, 'Skipped: Provider together does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.00026183295994997025, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 60, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + 
"parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.008863041992299259, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 61, 'Skipped: Provider together does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00023283297196030617, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 60, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.007975792046636343, + "outcome": "passed" + }, + "call": { + "duration": 2.1585817909799516, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005107080796733499, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 60, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.05228079203516245, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 61, 'Skipped: Provider together does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0017226670170202851, + 
"outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 60, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009964749915525317, + "outcome": "passed" + }, + "call": { + "duration": 4.6593364590080455, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0009852920193225145, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-gpt-4o]", + "lineno": 60, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_image[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.023214041953906417, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 61, 'Skipped: Provider together does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.0003567079547792673, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_image[input_output0-gpt-4o-mini]", + "lineno": 60, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_image[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01705008395947516, + "outcome": "skipped", + "longrepr": 
"('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 61, 'Skipped: Provider together does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.0003085409989580512, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 75, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.014711958006955683, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 76, 'Skipped: Provider together does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0003121249610558152, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 75, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_image[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01843333407305181, + "outcome": "passed" + }, + "call": { + "duration": 2.8683876669965684, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py", + "lineno": 89, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai/test_chat_completion.py", + "lineno": 89, + "message": "IndexError" + } + ], + "longrepr": "openai_client = 
\ninput_output = {'input': {'messages': [{'content': [{'text': 'What is in this image?', 'type': 'text'}, {'image_url': {...}, 'type': 'image_url'}], 'role': 'user'}]}, 'output': 'llama'}\ncorrect_model_name = 'meta-llama/Llama-4-Scout-17B-16E-Instruct'\n\n @pytest.mark.parametrize(\"model\", chat_completion_test_cases[\"test_chat_image\"][\"test_params\"][\"model\"])\n @pytest.mark.parametrize(\n \"input_output\",\n chat_completion_test_cases[\"test_chat_image\"][\"test_params\"][\"input_output\"],\n )\n def test_chat_streaming_image(openai_client, input_output, correct_model_name):\n response = openai_client.chat.completions.create(\n model=correct_model_name,\n messages=input_output[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai/test_chat_completion.py:89: IndexError" + }, + "teardown": { + "duration": 0.00028662499971687794, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 75, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00653208396397531, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 76, 'Skipped: Provider together does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.00021291698794811964, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 75, + 
"outcome": "failed", + "keywords": [ + "test_chat_streaming_image[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.006028458010405302, + "outcome": "passed" + }, + "call": { + "duration": 4.981105040991679, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py", + "lineno": 89, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai/test_chat_completion.py", + "lineno": 89, + "message": "IndexError" + } + ], + "longrepr": "openai_client = \ninput_output = {'input': {'messages': [{'content': [{'text': 'What is in this image?', 'type': 'text'}, {'image_url': {...}, 'type': 'image_url'}], 'role': 'user'}]}, 'output': 'llama'}\ncorrect_model_name = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\n\n @pytest.mark.parametrize(\"model\", chat_completion_test_cases[\"test_chat_image\"][\"test_params\"][\"model\"])\n @pytest.mark.parametrize(\n \"input_output\",\n chat_completion_test_cases[\"test_chat_image\"][\"test_params\"][\"input_output\"],\n )\n def test_chat_streaming_image(openai_client, input_output, correct_model_name):\n response = openai_client.chat.completions.create(\n model=correct_model_name,\n messages=input_output[\"input\"][\"messages\"],\n stream=True,\n )\n content = \"\"\n for chunk in response:\n> content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai/test_chat_completion.py:89: IndexError" + }, + "teardown": { + "duration": 0.0010110830189660192, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-gpt-4o]", + "lineno": 75, + "outcome": 
"skipped", + "keywords": [ + "test_chat_streaming_image[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01591233303770423, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 76, 'Skipped: Provider together does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.0003783750580623746, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_image[input_output0-gpt-4o-mini]", + "lineno": 75, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_image[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010691000032238662, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 76, 'Skipped: Provider together does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.00027445796877145767, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01258529198821634, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider together does not 
support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.0002044580178335309, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010904791066423059, + "outcome": "passed" + }, + "call": { + "duration": 0.8311828339938074, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00048687495291233063, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.029216791968792677, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider together does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0002269580727443099, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + 
"input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.013182583032175899, + "outcome": "passed" + }, + "call": { + "duration": 1.7446029160637408, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0008087089518085122, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.02009516698308289, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider together does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.000320291961543262, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.015216833096928895, + "outcome": "passed" + }, + "call": { + "duration": 0.8049291669158265, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005109170451760292, + "outcome": "passed" + } + }, + { + "nodeid": 
"tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-gpt-4o]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0171551660168916, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider together does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.0005707499803975224, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output0-gpt-4o-mini]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01131124992389232, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider together does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.0003044159384444356, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 
0.0054290409898385406, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider together does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.00014645792543888092, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.011368000064976513, + "outcome": "passed" + }, + "call": { + "duration": 4.363120499998331, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0003998749889433384, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.04945958300959319, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider together does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0002401659730821848, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + 
"lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.011090958025306463, + "outcome": "passed" + }, + "call": { + "duration": 4.699277375009842, + "outcome": "passed" + }, + "teardown": { + "duration": 0.000689250067807734, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.020744459005072713, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider together does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0001836250303313136, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 95, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005926624988205731, + "outcome": "passed" + }, + "call": { + "duration": 
2.7814464160474017, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0009554170537739992, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-gpt-4o]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-gpt-4o]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.03027112502604723, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider together does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.0003245410043746233, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_structured_output[input_output1-gpt-4o-mini]", + "lineno": 95, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_structured_output[input_output1-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.009138708002865314, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 96, 'Skipped: Provider together does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.0001919999485835433, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + 
"input_output0-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0064505410846322775, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider together does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.00015720794908702374, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 117, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00582624995149672, + "outcome": "passed" + }, + "call": { + "duration": 0.8302567919017747, + "outcome": "passed" + }, + "teardown": { + "duration": 0.00020354206208139658, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.014151416951790452, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider together does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.00034970801789313555, + "outcome": "passed" + } + }, + { + 
"nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 117, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012150791939347982, + "outcome": "passed" + }, + "call": { + "duration": 0.7078855830477551, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py", + "lineno": 135, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai/test_chat_completion.py", + "lineno": 135, + "message": "IndexError" + } + ], + "longrepr": "openai_client = \ninput_output = {'input': {'messages': [{'content': 'Extract the event information.', 'role': 'system'}, {'content': 'Alice and Bob ar...articipants'], 'title': 'CalendarEvent', 'type': 'object'}}, 'type': 'json_schema'}}, 'output': 'valid_calendar_event'}\ncorrect_model_name = 'meta-llama/Llama-4-Scout-17B-16E-Instruct'\n\n @pytest.mark.parametrize(\n \"model\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"model\"],\n )\n @pytest.mark.parametrize(\n \"input_output\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"input_output\"],\n )\n def test_chat_streaming_structured_output(openai_client, input_output, correct_model_name):\n response = openai_client.chat.completions.create(\n model=correct_model_name,\n messages=input_output[\"input\"][\"messages\"],\n response_format=input_output[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += 
chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai/test_chat_completion.py:135: IndexError" + }, + "teardown": { + "duration": 0.0008542909054085612, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.022667833953164518, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider together does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0006820419803261757, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 117, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.01285991701297462, + "outcome": "passed" + }, + "call": { + "duration": 0.6888671671040356, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py", + "lineno": 135, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai/test_chat_completion.py", + "lineno": 135, + "message": 
"IndexError" + } + ], + "longrepr": "openai_client = \ninput_output = {'input': {'messages': [{'content': 'Extract the event information.', 'role': 'system'}, {'content': 'Alice and Bob ar...articipants'], 'title': 'CalendarEvent', 'type': 'object'}}, 'type': 'json_schema'}}, 'output': 'valid_calendar_event'}\ncorrect_model_name = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\n\n @pytest.mark.parametrize(\n \"model\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"model\"],\n )\n @pytest.mark.parametrize(\n \"input_output\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"input_output\"],\n )\n def test_chat_streaming_structured_output(openai_client, input_output, correct_model_name):\n response = openai_client.chat.completions.create(\n model=correct_model_name,\n messages=input_output[\"input\"][\"messages\"],\n response_format=input_output[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai/test_chat_completion.py:135: IndexError" + }, + "teardown": { + "duration": 0.0007953330641612411, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-gpt-4o]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.015029000001959503, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider together does not support model gpt-4o')" + }, + "teardown": { + "duration": 
0.00015666603576391935, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output0-gpt-4o-mini]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.00622316705994308, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider together does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.0001533749746158719, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-3.3-8B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-8B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005598834017291665, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider together does not support model Llama-3.3-8B-Instruct')" + }, + "teardown": { + "duration": 0.00013062497600913048, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "lineno": 117, + "outcome": "passed", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-3.3-70B-Instruct", + 
"test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.005876541952602565, + "outcome": "passed" + }, + "call": { + "duration": 7.561108374968171, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0004579999949783087, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.018791542039252818, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider together does not support model Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0004900830099359155, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 117, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0065952910808846354, + "outcome": "passed" + }, + "call": { + "duration": 2.6826554159633815, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py", + "lineno": 135, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": 
"tests/verifications/openai/test_chat_completion.py", + "lineno": 135, + "message": "IndexError" + } + ], + "longrepr": "openai_client = \ninput_output = {'input': {'messages': [{'content': 'You are a helpful math tutor. Guide the user through the solution step by step.',... ['steps', 'final_answer'], 'title': 'MathReasoning', ...}}, 'type': 'json_schema'}}, 'output': 'valid_math_reasoning'}\ncorrect_model_name = 'meta-llama/Llama-4-Scout-17B-16E-Instruct'\n\n @pytest.mark.parametrize(\n \"model\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"model\"],\n )\n @pytest.mark.parametrize(\n \"input_output\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"input_output\"],\n )\n def test_chat_streaming_structured_output(openai_client, input_output, correct_model_name):\n response = openai_client.chat.completions.create(\n model=correct_model_name,\n messages=input_output[\"input\"][\"messages\"],\n response_format=input_output[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai/test_chat_completion.py:135: IndexError" + }, + "teardown": { + "duration": 0.0009669580031186342, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.019489208003506064, + "outcome": "skipped", + "longrepr": 
"('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider together does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0007419160101562738, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 117, + "outcome": "failed", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + "input_output1-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012299792026169598, + "outcome": "passed" + }, + "call": { + "duration": 2.829678333015181, + "outcome": "failed", + "crash": { + "path": "/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py", + "lineno": 135, + "message": "IndexError: list index out of range" + }, + "traceback": [ + { + "path": "tests/verifications/openai/test_chat_completion.py", + "lineno": 135, + "message": "IndexError" + } + ], + "longrepr": "openai_client = \ninput_output = {'input': {'messages': [{'content': 'You are a helpful math tutor. Guide the user through the solution step by step.',... 
['steps', 'final_answer'], 'title': 'MathReasoning', ...}}, 'type': 'json_schema'}}, 'output': 'valid_math_reasoning'}\ncorrect_model_name = 'meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'\n\n @pytest.mark.parametrize(\n \"model\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"model\"],\n )\n @pytest.mark.parametrize(\n \"input_output\",\n chat_completion_test_cases[\"test_chat_structured_output\"][\"test_params\"][\"input_output\"],\n )\n def test_chat_streaming_structured_output(openai_client, input_output, correct_model_name):\n response = openai_client.chat.completions.create(\n model=correct_model_name,\n messages=input_output[\"input\"][\"messages\"],\n response_format=input_output[\"input\"][\"response_format\"],\n stream=True,\n )\n maybe_json_content = \"\"\n for chunk in response:\n> maybe_json_content += chunk.choices[0].delta.content or \"\"\nE IndexError: list index out of range\n\ntests/verifications/openai/test_chat_completion.py:135: IndexError" + }, + "teardown": { + "duration": 0.0010418329620733857, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-gpt-4o]", + "lineno": 117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-gpt-4o]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.016189916990697384, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider together does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.00027966592460870743, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_streaming_structured_output[input_output1-gpt-4o-mini]", + "lineno": 
117, + "outcome": "skipped", + "keywords": [ + "test_chat_streaming_structured_output[input_output1-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output1-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.010247125057503581, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 118, 'Skipped: Provider together does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.00023291702382266521, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-3.3-70B-Instruct]", + "lineno": 138, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-3.3-70B-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-3.3-70B-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012632582918740809, + "outcome": "passed" + }, + "call": { + "duration": 0.40774812502786517, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0007319580763578415, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E]", + "lineno": 138, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.019890791969373822, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 139, 'Skipped: Provider together does not support model 
Llama-4-Scout-17B-16E')" + }, + "teardown": { + "duration": 0.0006391670322045684, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "lineno": 138, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-4-Scout-17B-16E-Instruct]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Scout-17B-16E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.0178165000397712, + "outcome": "passed" + }, + "call": { + "duration": 0.38229950005188584, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0010000420734286308, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E]", + "lineno": 138, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E]", + "parametrize", + "pytestmark", + "input_output0-Llama-4-Maverick-17B-128E", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.024259291938506067, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 139, 'Skipped: Provider together does not support model Llama-4-Maverick-17B-128E')" + }, + "teardown": { + "duration": 0.0003602079814299941, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "lineno": 138, + "outcome": "passed", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-Llama-4-Maverick-17B-128E-Instruct]", + "parametrize", + "pytestmark", + 
"input_output0-Llama-4-Maverick-17B-128E-Instruct", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012425708002410829, + "outcome": "passed" + }, + "call": { + "duration": 0.7610744580160826, + "outcome": "passed" + }, + "teardown": { + "duration": 0.0005935420049354434, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-gpt-4o]", + "lineno": 138, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-gpt-4o]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.018717541941441596, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 139, 'Skipped: Provider together does not support model gpt-4o')" + }, + "teardown": { + "duration": 0.000659791985526681, + "outcome": "passed" + } + }, + { + "nodeid": "tests/verifications/openai/test_chat_completion.py::test_chat_non_streaming_tool_calling[input_output0-gpt-4o-mini]", + "lineno": 138, + "outcome": "skipped", + "keywords": [ + "test_chat_non_streaming_tool_calling[input_output0-gpt-4o-mini]", + "parametrize", + "pytestmark", + "input_output0-gpt-4o-mini", + "test_chat_completion.py", + "openai", + "verifications", + "tests", + "llama-stack", + "" + ], + "setup": { + "duration": 0.012784749967977405, + "outcome": "skipped", + "longrepr": "('/Users/erichuang/projects/llama-stack/tests/verifications/openai/test_chat_completion.py', 139, 'Skipped: Provider together does not support model gpt-4o-mini')" + }, + "teardown": { + "duration": 0.0002145830076187849, + "outcome": "passed" + } + } + ] +} From 983f6feeb8eae327adb8273ecf0f309985fcf648 Mon Sep 17 00:00:00 2001 From: AlexHe99 Date: Wed, 9 Apr 
2025 12:35:32 +0800 Subject: [PATCH 13/24] docs: Update remote-vllm.md with AMD GPU vLLM server supported. (#1858) Add the content to use AMD GPU as the vLLM server. Split the original part to two sub chapters, 1. AMD vLLM server 2. NVIDIA vLLM server (original) # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) --------- Signed-off-by: Alex He --- .../self_hosted_distro/remote-vllm.md | 74 +++++++++++++++++++ .../templates/remote-vllm/doc_template.md | 74 +++++++++++++++++++ 2 files changed, 148 insertions(+) diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md index b6e8a8ad4..457d703b3 100644 --- a/docs/source/distributions/self_hosted_distro/remote-vllm.md +++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md @@ -41,6 +41,80 @@ The following environment variables can be configured: ## Setting up vLLM server +Both AMD and NVIDIA GPUs can serve as accelerators for the vLLM server, which acts as both the LLM inference provider and the safety provider. + +### Setting up vLLM server on AMD GPU + +AMD provides two main vLLM container options: +- rocm/vllm: Production-ready container +- rocm/vllm-dev: Development container with the latest vLLM features + +Please check the [Blog about ROCm vLLM Usage](https://rocm.blogs.amd.com/software-tools-optimization/vllm-container/README.html) to get more details. 
+ +Here is a sample script to start a ROCm vLLM server locally via Docker: + +```bash +export INFERENCE_PORT=8000 +export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct +export CUDA_VISIBLE_DEVICES=0 +export VLLM_DIMG="rocm/vllm-dev:main" + +docker run \ + --pull always \ + --ipc=host \ + --privileged \ + --shm-size 16g \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --cap-add=SYS_PTRACE \ + --cap-add=CAP_SYS_ADMIN \ + --security-opt seccomp=unconfined \ + --security-opt apparmor=unconfined \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + --env "HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \ + -p $INFERENCE_PORT:$INFERENCE_PORT \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + $VLLM_DIMG \ + python -m vllm.entrypoints.openai.api_server \ + --model $INFERENCE_MODEL \ + --port $INFERENCE_PORT +``` + +Note that you'll also need to set `--enable-auto-tool-choice` and `--tool-call-parser` to [enable tool calling in vLLM](https://docs.vllm.ai/en/latest/features/tool_calling.html). 
+ +If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: + +```bash +export SAFETY_PORT=8081 +export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B +export CUDA_VISIBLE_DEVICES=1 +export VLLM_DIMG="rocm/vllm-dev:main" + +docker run \ + --pull always \ + --ipc=host \ + --privileged \ + --shm-size 16g \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --cap-add=SYS_PTRACE \ + --cap-add=CAP_SYS_ADMIN \ + --security-opt seccomp=unconfined \ + --security-opt apparmor=unconfined \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + --env "HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \ + -p $SAFETY_PORT:$SAFETY_PORT \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + $VLLM_DIMG \ + python -m vllm.entrypoints.openai.api_server \ + --model $SAFETY_MODEL \ + --port $SAFETY_PORT +``` + +### Setting up vLLM server on NVIDIA GPU + Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) to get a vLLM endpoint. Here is a sample script to start a vLLM server locally via Docker: ```bash diff --git a/llama_stack/templates/remote-vllm/doc_template.md b/llama_stack/templates/remote-vllm/doc_template.md index 57c9f116c..7543e8239 100644 --- a/llama_stack/templates/remote-vllm/doc_template.md +++ b/llama_stack/templates/remote-vllm/doc_template.md @@ -28,6 +28,80 @@ The following environment variables can be configured: ## Setting up vLLM server +Both AMD and NVIDIA GPUs can serve as accelerators for the vLLM server, which acts as both the LLM inference provider and the safety provider. 
+ +### Setting up vLLM server on AMD GPU + +AMD provides two main vLLM container options: +- rocm/vllm: Production-ready container +- rocm/vllm-dev: Development container with the latest vLLM features + +Please check the [Blog about ROCm vLLM Usage](https://rocm.blogs.amd.com/software-tools-optimization/vllm-container/README.html) to get more details. + +Here is a sample script to start a ROCm vLLM server locally via Docker: + +```bash +export INFERENCE_PORT=8000 +export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct +export CUDA_VISIBLE_DEVICES=0 +export VLLM_DIMG="rocm/vllm-dev:main" + +docker run \ + --pull always \ + --ipc=host \ + --privileged \ + --shm-size 16g \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --cap-add=SYS_PTRACE \ + --cap-add=CAP_SYS_ADMIN \ + --security-opt seccomp=unconfined \ + --security-opt apparmor=unconfined \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + --env "HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \ + -p $INFERENCE_PORT:$INFERENCE_PORT \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + $VLLM_DIMG \ + python -m vllm.entrypoints.openai.api_server \ + --model $INFERENCE_MODEL \ + --port $INFERENCE_PORT +``` + +Note that you'll also need to set `--enable-auto-tool-choice` and `--tool-call-parser` to [enable tool calling in vLLM](https://docs.vllm.ai/en/latest/features/tool_calling.html). 
+ +If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: + +```bash +export SAFETY_PORT=8081 +export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B +export CUDA_VISIBLE_DEVICES=1 +export VLLM_DIMG="rocm/vllm-dev:main" + +docker run \ + --pull always \ + --ipc=host \ + --privileged \ + --shm-size 16g \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --cap-add=SYS_PTRACE \ + --cap-add=CAP_SYS_ADMIN \ + --security-opt seccomp=unconfined \ + --security-opt apparmor=unconfined \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + --env "HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \ + -p $SAFETY_PORT:$SAFETY_PORT \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + $VLLM_DIMG \ + python -m vllm.entrypoints.openai.api_server \ + --model $SAFETY_MODEL \ + --port $SAFETY_PORT +``` + +### Setting up vLLM server on NVIDIA GPU + Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) to get a vLLM endpoint. Here is a sample script to start a vLLM server locally via Docker: ```bash From 10882bf478bcb8d89babeb9416ab24fb39385d20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Wed, 9 Apr 2025 09:43:48 +0200 Subject: [PATCH 14/24] chore: remove unused tempdir in agent (#1896) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? The usage of the tempdir was removed in 094eb6a5ae8dbac297fe59914db11c612250f92f. 
Signed-off-by: Sébastien Han --- .../providers/inline/agents/meta_reference/agent_instance.py | 2 -- llama_stack/providers/inline/agents/meta_reference/agents.py | 3 --- 2 files changed, 5 deletions(-) diff --git a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py index 6840da89f..f441d6eb6 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agent_instance.py +++ b/llama_stack/providers/inline/agents/meta_reference/agent_instance.py @@ -89,7 +89,6 @@ class ChatAgent(ShieldRunnerMixin): self, agent_id: str, agent_config: AgentConfig, - tempdir: str, inference_api: Inference, safety_api: Safety, tool_runtime_api: ToolRuntime, @@ -99,7 +98,6 @@ class ChatAgent(ShieldRunnerMixin): ): self.agent_id = agent_id self.agent_config = agent_config - self.tempdir = tempdir self.inference_api = inference_api self.safety_api = safety_api self.vector_io_api = vector_io_api diff --git a/llama_stack/providers/inline/agents/meta_reference/agents.py b/llama_stack/providers/inline/agents/meta_reference/agents.py index 5ca123595..656178773 100644 --- a/llama_stack/providers/inline/agents/meta_reference/agents.py +++ b/llama_stack/providers/inline/agents/meta_reference/agents.py @@ -7,7 +7,6 @@ import json import logging import shutil -import tempfile import uuid from typing import AsyncGenerator, List, Optional, Union @@ -64,7 +63,6 @@ class MetaReferenceAgentsImpl(Agents): self.tool_groups_api = tool_groups_api self.in_memory_store = InmemoryKVStoreImpl() - self.tempdir = tempfile.mkdtemp() async def initialize(self) -> None: self.persistence_store = await kvstore_impl(self.config.persistence_store) @@ -107,7 +105,6 @@ class MetaReferenceAgentsImpl(Agents): return ChatAgent( agent_id=agent_id, agent_config=agent_config, - tempdir=self.tempdir, inference_api=self.inference_api, safety_api=self.safety_api, vector_io_api=self.vector_io_api, From 8001c30a4fe12ca15c79ca7b7038c30d1d7b181f 
Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Wed, 9 Apr 2025 00:46:02 -0700 Subject: [PATCH 15/24] fix: meta reference + llama4 tokenizer fix --- llama_stack/models/llama/llama4/tokenizer.py | 4 +++- .../providers/inline/inference/meta_reference/generators.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llama_stack/models/llama/llama4/tokenizer.py b/llama_stack/models/llama/llama4/tokenizer.py index 4d271e5fd..8eabc3205 100644 --- a/llama_stack/models/llama/llama4/tokenizer.py +++ b/llama_stack/models/llama/llama4/tokenizer.py @@ -56,9 +56,11 @@ LLAMA4_TEXT_POST_TRAIN_SPECIAL_TOKENS = [ "<|text_post_train_reserved_special_token_3|>", "<|text_post_train_reserved_special_token_4|>", "<|text_post_train_reserved_special_token_5|>", + "<|text_post_train_reserved_special_token_6|>", + "<|text_post_train_reserved_special_token_7|>", "<|finetune_right_pad|>", ] + get_reserved_special_tokens( - "text_post_train", 61, 6 + "text_post_train", 61, 8 ) # <|text_post_train_reserved_special_token_6|>, ..., <|text_post_train_reserved_special_token_66|> # 200080, ..., 201133 diff --git a/llama_stack/providers/inline/inference/meta_reference/generators.py b/llama_stack/providers/inline/inference/meta_reference/generators.py index 65bed4d8c..34dd58a9a 100644 --- a/llama_stack/providers/inline/inference/meta_reference/generators.py +++ b/llama_stack/providers/inline/inference/meta_reference/generators.py @@ -259,7 +259,7 @@ class Llama3Generator: temperature, top_p = _infer_sampling_params(sampling_params) for result in self.inner_generator.generate( - llm_inputs=[self.formatter.encode_content(request.content)], + model_inputs=[self.formatter.encode_content(request.content)], max_gen_len=max_gen_len, temperature=temperature, top_p=top_p, @@ -284,7 +284,7 @@ class Llama3Generator: temperature, top_p = _infer_sampling_params(sampling_params) for result in self.inner_generator.generate( - llm_inputs=[self.formatter.encode_dialog_prompt(request.messages, 
_infer_tool_prompt_format(request))], + model_inputs=[self.formatter.encode_dialog_prompt(request.messages, _infer_tool_prompt_format(request))], max_gen_len=max_gen_len, temperature=temperature, top_p=top_p, From e3d22d8de733b1786087fba85920695c40e15777 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Wed, 9 Apr 2025 04:10:07 -0400 Subject: [PATCH 16/24] chore: fix hash for thollander/actions-comment-pull-request (#1900) # What does this PR do? Fix hash for v3.0.1 tag for a github action. Signed-off-by: Ihar Hrachyshka --- .github/workflows/gha_workflow_llama_stack_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gha_workflow_llama_stack_tests.yml b/.github/workflows/gha_workflow_llama_stack_tests.yml index 91b9d2f3b..9eae291e9 100644 --- a/.github/workflows/gha_workflow_llama_stack_tests.yml +++ b/.github/workflows/gha_workflow_llama_stack_tests.yml @@ -320,7 +320,7 @@ jobs: - name: "PR - Update comment" id: pr_update_comment if: github.event_name == 'pull_request_target' - uses: thollander/actions-comment-pull-request@65f9e5c9a1f2cd378bd74b2e057c9736982a8e74 # v3.0.1 + uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3.0.1 with: filePath: test-summary.md From 45e210fd0c43ee76a93f21275575f3cbd83a70f6 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Wed, 9 Apr 2025 01:09:16 -0700 Subject: [PATCH 17/24] fix: llama3 bf16 model load --- llama_stack/models/llama/llama3/generation.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llama_stack/models/llama/llama3/generation.py b/llama_stack/models/llama/llama3/generation.py index ee99a07ba..8c6aa242b 100644 --- a/llama_stack/models/llama/llama3/generation.py +++ b/llama_stack/models/llama/llama3/generation.py @@ -119,17 +119,16 @@ class Llama3: torch.set_default_device(device) else: print(f"Setting default device to {device}") - torch.set_default_device(device) if device.type == "cuda": if 
torch.cuda.is_bf16_supported(): - torch.set_default_dtype(torch.bfloat16) + torch.set_default_tensor_type(torch.cuda.BFloat16Tensor) else: - torch.set_default_dtype(torch.half) + torch.set_default_tensor_type(torch.cuda.Float16Tensor) elif device.type == "xpu": if torch.xpu.is_bf16_supported(): - torch.set_default_dtype(torch.bfloat16) + torch.set_default_tensor_type(torch.xpu.BFloat16Tensor) else: - torch.set_default_dtype(torch.half) + torch.set_default_tensor_type(torch.xpu.Float16Tensor) model = build_model() print("Loading state dict...") From 389767010b0333c49cf6cb86122308a5ec474621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Wed, 9 Apr 2025 10:30:41 +0200 Subject: [PATCH 18/24] feat: ability to execute external providers (#1672) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Providers that live outside of the llama-stack codebase are now supported. A new property `external_providers_dir` has been added to the main config and can be configured as follow: ``` external_providers_dir: /etc/llama-stack/providers.d/ ``` Where the expected structure is: ``` providers.d/ inference/ custom_ollama.yaml vllm.yaml vector_io/ qdrant.yaml ``` Where `custom_ollama.yaml` is: ``` adapter: adapter_type: custom_ollama pip_packages: ["ollama", "aiohttp"] config_class: llama_stack_ollama_provider.config.OllamaImplConfig module: llama_stack_ollama_provider api_dependencies: [] optional_api_dependencies: [] ``` Obviously the package must be installed on the system, here is the `llama_stack_ollama_provider` example: ``` $ uv pip show llama-stack-ollama-provider Using Python 3.10.16 environment at: /Users/leseb/Documents/AI/llama-stack/.venv Name: llama-stack-ollama-provider Version: 0.1.0 Location: /Users/leseb/Documents/AI/llama-stack/.venv/lib/python3.10/site-packages Editable project location: 
/private/var/folders/mq/rnm5w_7s2d3fxmtkx02knvhm0000gn/T/tmp.ZBHU5Ezxg4/ollama/llama-stack-ollama-provider Requires: Required-by: ``` Closes: https://github.com/meta-llama/llama-stack/issues/658 Signed-off-by: Sébastien Han --- .github/workflows/test-external-providers.yml | 93 +++++++ docs/source/providers/external.md | 234 ++++++++++++++++++ docs/source/providers/index.md | 5 + llama_stack/distribution/datatypes.py | 5 + llama_stack/distribution/distribution.py | 130 +++++++++- llama_stack/distribution/resolver.py | 1 + llama_stack/distribution/stack.py | 2 +- .../llama-stack-provider-ollama/README.md | 3 + .../custom_ollama.yaml | 7 + .../pyproject.toml | 44 ++++ .../llama-stack-provider-ollama/run.yaml | 135 ++++++++++ tests/unit/distribution/test_distribution.py | 223 +++++++++++++++++ 12 files changed, 875 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/test-external-providers.yml create mode 100644 docs/source/providers/external.md create mode 100644 tests/external-provider/llama-stack-provider-ollama/README.md create mode 100644 tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml create mode 100644 tests/external-provider/llama-stack-provider-ollama/pyproject.toml create mode 100644 tests/external-provider/llama-stack-provider-ollama/run.yaml create mode 100644 tests/unit/distribution/test_distribution.py diff --git a/.github/workflows/test-external-providers.yml b/.github/workflows/test-external-providers.yml new file mode 100644 index 000000000..2ead8f845 --- /dev/null +++ b/.github/workflows/test-external-providers.yml @@ -0,0 +1,93 @@ +name: Test External Providers + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test-external-providers: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + python-version: "3.10" + + - name: Install Ollama + run: | + curl -fsSL 
https://ollama.com/install.sh | sh + + - name: Pull Ollama image + run: | + ollama pull llama3.2:3b-instruct-fp16 + + - name: Start Ollama in background + run: | + nohup ollama run llama3.2:3b-instruct-fp16 --keepalive=30m > ollama.log 2>&1 & + + - name: Set Up Environment and Install Dependencies + run: | + uv sync --extra dev --extra test + uv pip install -e . + + - name: Install Ollama custom provider + run: | + mkdir -p tests/external-provider/llama-stack-provider-ollama/src/ + cp -a llama_stack/providers/remote/inference/ollama/ tests/external-provider/llama-stack-provider-ollama/src/llama_stack_provider_ollama + uv pip install tests/external-provider/llama-stack-provider-ollama + + - name: Create provider configuration + run: | + mkdir -p /tmp/providers.d/remote/inference + cp tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml /tmp/providers.d/remote/inference/custom_ollama.yaml + + - name: Wait for Ollama to start + run: | + echo "Waiting for Ollama..." + for i in {1..30}; do + if curl -s http://localhost:11434 | grep -q "Ollama is running"; then + echo "Ollama is running!" + exit 0 + fi + sleep 1 + done + echo "Ollama failed to start" + ollama ps + cat ollama.log + exit 1 + + - name: Start Llama Stack server in background + env: + INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + run: | + source .venv/bin/activate + nohup uv run llama stack run tests/external-provider/llama-stack-provider-ollama/run.yaml --image-type venv > server.log 2>&1 & + + - name: Wait for Llama Stack server to be ready + run: | + echo "Waiting for Llama Stack server..." + for i in {1..30}; do + if curl -s http://localhost:8321/v1/health | grep -q "OK"; then + echo "Llama Stack server is up!" 
+ if grep -q "remote::custom_ollama from /tmp/providers.d/remote/inference/custom_ollama.yaml" server.log; then + echo "Llama Stack server is using custom Ollama provider" + exit 0 + else + echo "Llama Stack server is not using custom Ollama provider" + exit 1 + fi + fi + sleep 1 + done + echo "Llama Stack server failed to start" + cat server.log + exit 1 + + - name: run inference tests + run: | + uv run pytest -v tests/integration/inference/test_text_inference.py --stack-config="http://localhost:8321" --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2 diff --git a/docs/source/providers/external.md b/docs/source/providers/external.md new file mode 100644 index 000000000..90fc77979 --- /dev/null +++ b/docs/source/providers/external.md @@ -0,0 +1,234 @@ +# External Providers + +Llama Stack supports external providers that live outside of the main codebase. This allows you to: +- Create and maintain your own providers independently +- Share providers with others without contributing to the main codebase +- Keep provider-specific code separate from the core Llama Stack code + +## Configuration + +To enable external providers, you need to configure the `external_providers_dir` in your Llama Stack configuration. This directory should contain your external provider specifications: + +```yaml +external_providers_dir: /etc/llama-stack/providers.d/ +``` + +## Directory Structure + +The external providers directory should follow this structure: + +``` +providers.d/ + remote/ + inference/ + custom_ollama.yaml + vllm.yaml + vector_io/ + qdrant.yaml + safety/ + llama-guard.yaml + inline/ + inference/ + custom_ollama.yaml + vllm.yaml + vector_io/ + qdrant.yaml + safety/ + llama-guard.yaml +``` + +Each YAML file in these directories defines a provider specification for that particular API. + +## Provider Types + +Llama Stack supports two types of external providers: + +1. 
**Remote Providers**: Providers that communicate with external services (e.g., cloud APIs) +2. **Inline Providers**: Providers that run locally within the Llama Stack process + +## Known External Providers + +Here's a list of known external providers that you can use with Llama Stack: + +| Type | Name | Description | Repository | +|------|------|-------------|------------| +| Remote | KubeFlow Training | Train models with KubeFlow | [llama-stack-provider-kft](https://github.com/opendatahub-io/llama-stack-provider-kft) | + +### Remote Provider Specification + +Remote providers are used when you need to communicate with external services. Here's an example for a custom Ollama provider: + +```yaml +adapter: + adapter_type: custom_ollama + pip_packages: + - ollama + - aiohttp + config_class: llama_stack_ollama_provider.config.OllamaImplConfig + module: llama_stack_ollama_provider +api_dependencies: [] +optional_api_dependencies: [] +``` + +#### Adapter Configuration + +The `adapter` section defines how to load and configure the provider: + +- `adapter_type`: A unique identifier for this adapter +- `pip_packages`: List of Python packages required by the provider +- `config_class`: The full path to the configuration class +- `module`: The Python module containing the provider implementation + +### Inline Provider Specification + +Inline providers run locally within the Llama Stack process. 
Here's an example for a custom vector store provider: + +```yaml +module: llama_stack_vector_provider +config_class: llama_stack_vector_provider.config.VectorStoreConfig +pip_packages: + - faiss-cpu + - numpy +api_dependencies: + - inference +optional_api_dependencies: + - vector_io +provider_data_validator: llama_stack_vector_provider.validator.VectorStoreValidator +container_image: custom-vector-store:latest # optional +``` + +#### Inline Provider Fields + +- `module`: The Python module containing the provider implementation +- `config_class`: The full path to the configuration class +- `pip_packages`: List of Python packages required by the provider +- `api_dependencies`: List of Llama Stack APIs that this provider depends on +- `optional_api_dependencies`: List of optional Llama Stack APIs that this provider can use +- `provider_data_validator`: Optional validator for provider data +- `container_image`: Optional container image to use instead of pip packages + +## Required Implementation + +### Remote Providers + +Remote providers must expose a `get_adapter_impl()` function in their module that takes two arguments: +1. `config`: An instance of the provider's config class +2. `deps`: A dictionary of API dependencies + +This function must return an instance of the provider's adapter class that implements the required protocol for the API. + +Example: +```python +async def get_adapter_impl( + config: OllamaImplConfig, deps: Dict[Api, Any] +) -> OllamaInferenceAdapter: + return OllamaInferenceAdapter(config) +``` + +### Inline Providers + +Inline providers must expose a `get_provider_impl()` function in their module that takes two arguments: +1. `config`: An instance of the provider's config class +2. 
`deps`: A dictionary of API dependencies + +Example: +```python +async def get_provider_impl( + config: VectorStoreConfig, deps: Dict[Api, Any] +) -> VectorStoreImpl: + impl = VectorStoreImpl(config, deps[Api.inference]) + await impl.initialize() + return impl +``` + +## Dependencies + +The provider package must be installed on the system. For example: + +```bash +$ uv pip show llama-stack-ollama-provider +Name: llama-stack-ollama-provider +Version: 0.1.0 +Location: /path/to/venv/lib/python3.10/site-packages +``` + +## Example: Custom Ollama Provider + +Here's a complete example of creating and using a custom Ollama provider: + +1. First, create the provider package: + +```bash +mkdir -p llama-stack-provider-ollama +cd llama-stack-provider-ollama +git init +uv init +``` + +2. Edit `pyproject.toml`: + +```toml +[project] +name = "llama-stack-provider-ollama" +version = "0.1.0" +description = "Ollama provider for Llama Stack" +requires-python = ">=3.10" +dependencies = ["llama-stack", "pydantic", "ollama", "aiohttp"] +``` + +3. Create the provider specification: + +```yaml +# /etc/llama-stack/providers.d/remote/inference/custom_ollama.yaml +adapter: + adapter_type: custom_ollama + pip_packages: ["ollama", "aiohttp"] + config_class: llama_stack_provider_ollama.config.OllamaImplConfig + module: llama_stack_provider_ollama +api_dependencies: [] +optional_api_dependencies: [] +``` + +4. Install the provider: + +```bash +uv pip install -e . +``` + +5. Configure Llama Stack to use external providers: + +```yaml +external_providers_dir: /etc/llama-stack/providers.d/ +``` + +The provider will now be available in Llama Stack with the type `remote::custom_ollama`. + +## Best Practices + +1. **Package Naming**: Use the prefix `llama-stack-provider-` for your provider packages to make them easily identifiable. + +2. **Version Management**: Keep your provider package versioned and compatible with the Llama Stack version you're using. + +3. 
**Dependencies**: Only include the minimum required dependencies in your provider package. + +4. **Documentation**: Include clear documentation in your provider package about: + - Installation requirements + - Configuration options + - Usage examples + - Any limitations or known issues + +5. **Testing**: Include tests in your provider package to ensure it works correctly with Llama Stack. +You can refer to the [integration tests +guide](https://github.com/meta-llama/llama-stack/blob/main/tests/integration/README.md) for more +information. Execute the test for the Provider type you are developing. + +## Troubleshooting + +If your external provider isn't being loaded: + +1. Check that the `external_providers_dir` path is correct and accessible. +2. Verify that the YAML files are properly formatted. +3. Ensure all required Python packages are installed. +4. Check the Llama Stack server logs for any error messages - turn on debug logging to get more + information using `LLAMA_STACK_LOGGING=all=debug`. +5. Verify that the provider package is installed in your Python environment. diff --git a/docs/source/providers/index.md b/docs/source/providers/index.md index f8997a281..75faf7c00 100644 --- a/docs/source/providers/index.md +++ b/docs/source/providers/index.md @@ -11,6 +11,10 @@ Providers come in two flavors: Importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally. +## External Providers + +Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently. See the [External Providers Guide](external) for details. + ## Agents Run multi-step agentic workflows with LLMs with tool usage, memory (RAG), etc. 
@@ -50,6 +54,7 @@ The following providers (i.e., databases) are available for Vector IO: ```{toctree} :maxdepth: 1 +external vector_io/faiss vector_io/sqlite-vec vector_io/chromadb diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 48f1925dd..b24b0ec50 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -312,6 +312,11 @@ a default SQLite store will be used.""", description="Configuration for the HTTP(S) server", ) + external_providers_dir: Optional[str] = Field( + default=None, + description="Path to directory containing external provider implementations. The providers code and dependencies must be installed on the system.", + ) + class BuildConfig(BaseModel): version: str = LLAMA_STACK_BUILD_CONFIG_VERSION diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index ddb727663..d4447139c 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -4,12 +4,25 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import glob import importlib -from typing import Dict, List +import os +from typing import Any, Dict, List +import yaml from pydantic import BaseModel -from llama_stack.providers.datatypes import Api, ProviderSpec +from llama_stack.distribution.datatypes import StackRunConfig +from llama_stack.log import get_logger +from llama_stack.providers.datatypes import ( + AdapterSpec, + Api, + InlineProviderSpec, + ProviderSpec, + remote_provider_spec, +) + +logger = get_logger(name=__name__, category="core") def stack_apis() -> List[Api]: @@ -59,11 +72,116 @@ def providable_apis() -> List[Api]: return [api for api in Api if api not in routing_table_apis and api != Api.inspect and api != Api.providers] -def get_provider_registry() -> Dict[Api, Dict[str, ProviderSpec]]: - ret = {} +def _load_remote_provider_spec(spec_data: Dict[str, Any], api: Api) -> ProviderSpec: + adapter = AdapterSpec(**spec_data["adapter"]) + spec = remote_provider_spec( + api=api, + adapter=adapter, + api_dependencies=[Api(dep) for dep in spec_data.get("api_dependencies", [])], + ) + return spec + + +def _load_inline_provider_spec(spec_data: Dict[str, Any], api: Api, provider_name: str) -> ProviderSpec: + spec = InlineProviderSpec( + api=api, + provider_type=f"inline::{provider_name}", + pip_packages=spec_data.get("pip_packages", []), + module=spec_data["module"], + config_class=spec_data["config_class"], + api_dependencies=[Api(dep) for dep in spec_data.get("api_dependencies", [])], + optional_api_dependencies=[Api(dep) for dep in spec_data.get("optional_api_dependencies", [])], + provider_data_validator=spec_data.get("provider_data_validator"), + container_image=spec_data.get("container_image"), + ) + return spec + + +def get_provider_registry(config: StackRunConfig | None = None) -> Dict[Api, Dict[str, ProviderSpec]]: + """Get the provider registry, optionally including external providers. + + This function loads both built-in providers and external providers from YAML files. 
+ External providers are loaded from a directory structure like: + + providers.d/ + remote/ + inference/ + custom_ollama.yaml + vllm.yaml + vector_io/ + qdrant.yaml + safety/ + llama-guard.yaml + inline/ + inference/ + custom_ollama.yaml + vllm.yaml + vector_io/ + qdrant.yaml + safety/ + llama-guard.yaml + + Args: + config: Optional StackRunConfig containing the external providers directory path + + Returns: + A dictionary mapping APIs to their available providers + + Raises: + FileNotFoundError: If the external providers directory doesn't exist + ValueError: If any provider spec is invalid + """ + + ret: Dict[Api, Dict[str, ProviderSpec]] = {} for api in providable_apis(): name = api.name.lower() - module = importlib.import_module(f"llama_stack.providers.registry.{name}") - ret[api] = {a.provider_type: a for a in module.available_providers()} + logger.debug(f"Importing module {name}") + try: + module = importlib.import_module(f"llama_stack.providers.registry.{name}") + ret[api] = {a.provider_type: a for a in module.available_providers()} + except ImportError as e: + logger.warning(f"Failed to import module {name}: {e}") + if config and config.external_providers_dir: + external_providers_dir = os.path.abspath(config.external_providers_dir) + if not os.path.exists(external_providers_dir): + raise FileNotFoundError(f"External providers directory not found: {external_providers_dir}") + logger.info(f"Loading external providers from {external_providers_dir}") + + for api in providable_apis(): + api_name = api.name.lower() + + # Process both remote and inline providers + for provider_type in ["remote", "inline"]: + api_dir = os.path.join(external_providers_dir, provider_type, api_name) + if not os.path.exists(api_dir): + logger.debug(f"No {provider_type} provider directory found for {api_name}") + continue + + # Look for provider spec files in the API directory + for spec_path in glob.glob(os.path.join(api_dir, "*.yaml")): + provider_name = 
os.path.splitext(os.path.basename(spec_path))[0] + logger.info(f"Loading {provider_type} provider spec from {spec_path}") + + try: + with open(spec_path) as f: + spec_data = yaml.safe_load(f) + + if provider_type == "remote": + spec = _load_remote_provider_spec(spec_data, api) + provider_type_key = f"remote::{provider_name}" + else: + spec = _load_inline_provider_spec(spec_data, api, provider_name) + provider_type_key = f"inline::{provider_name}" + + logger.info(f"Loaded {provider_type} provider spec for {provider_type_key} from {spec_path}") + if provider_type_key in ret[api]: + logger.warning(f"Overriding already registered provider {provider_type_key} for {api.name}") + ret[api][provider_type_key] = spec + except yaml.YAMLError as yaml_err: + logger.error(f"Failed to parse YAML file {spec_path}: {yaml_err}") + raise yaml_err + except Exception as e: + logger.error(f"Failed to load provider spec from {spec_path}: {e}") + raise e return ret diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index 25fe3f184..33ad343ec 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -351,6 +351,7 @@ async def instantiate_provider( if not hasattr(provider_spec, "module"): raise AttributeError(f"ProviderSpec of type {type(provider_spec)} does not have a 'module' attribute") + logger.debug(f"Instantiating provider {provider.provider_id} from {provider_spec.module}") module = importlib.import_module(provider_spec.module) args = [] if isinstance(provider_spec, RemoteProviderSpec): diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index 9c9289a77..d70878db4 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -218,7 +218,7 @@ async def construct_stack( run_config: StackRunConfig, provider_registry: Optional[ProviderRegistry] = None ) -> Dict[Api, Any]: dist_registry, _ = await create_dist_registry(run_config.metadata_store, 
run_config.image_name) - impls = await resolve_impls(run_config, provider_registry or get_provider_registry(), dist_registry) + impls = await resolve_impls(run_config, provider_registry or get_provider_registry(run_config), dist_registry) await register_resources(run_config, impls) return impls diff --git a/tests/external-provider/llama-stack-provider-ollama/README.md b/tests/external-provider/llama-stack-provider-ollama/README.md new file mode 100644 index 000000000..8bd2b6a87 --- /dev/null +++ b/tests/external-provider/llama-stack-provider-ollama/README.md @@ -0,0 +1,3 @@ +# Ollama external provider for Llama Stack + +Template code to create a new external provider for Llama Stack. diff --git a/tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml b/tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml new file mode 100644 index 000000000..f0960b4d8 --- /dev/null +++ b/tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml @@ -0,0 +1,7 @@ +adapter: + adapter_type: custom_ollama + pip_packages: ["ollama", "aiohttp"] + config_class: llama_stack_provider_ollama.config.OllamaImplConfig + module: llama_stack_provider_ollama +api_dependencies: [] +optional_api_dependencies: [] diff --git a/tests/external-provider/llama-stack-provider-ollama/pyproject.toml b/tests/external-provider/llama-stack-provider-ollama/pyproject.toml new file mode 100644 index 000000000..ddebc54b0 --- /dev/null +++ b/tests/external-provider/llama-stack-provider-ollama/pyproject.toml @@ -0,0 +1,44 @@ +[project] +dependencies = [ + "llama-stack", + "pydantic", + "ollama", + "aiohttp", + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "mcp", + "nltk", + "numpy", + "openai", + "opentelemetry-exporter-otlp-proto-http", + "opentelemetry-sdk", + "pandas", + "pillow", + "psycopg2-binary", + "pymongo", + "pypdf", + "redis", + "requests", + 
"scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "tree_sitter", + "uvicorn", +] + +name = "llama-stack-provider-ollama" +version = "0.1.0" +description = "External provider for Ollama using the Llama Stack API" +readme = "README.md" +requires-python = ">=3.10" diff --git a/tests/external-provider/llama-stack-provider-ollama/run.yaml b/tests/external-provider/llama-stack-provider-ollama/run.yaml new file mode 100644 index 000000000..7a3636c4d --- /dev/null +++ b/tests/external-provider/llama-stack-provider-ollama/run.yaml @@ -0,0 +1,135 @@ +version: '2' +image_name: ollama +apis: +- agents +- datasetio +- eval +- inference +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: custom_ollama + provider_type: remote::custom_ollama + config: + url: ${env.OLLAMA_URL:http://localhost:11434} + vector_io: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/meta_reference_eval.db + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + 
config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/huggingface_datasetio.db + - provider_id: localfs + provider_type: inline::localfs + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/localfs_datasetio.db + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: code-interpreter + provider_type: inline::code-interpreter + config: {} + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} + - provider_id: wolfram-alpha + provider_type: remote::wolfram-alpha + config: + api_key: ${env.WOLFRAM_ALPHA_API_KEY:} +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/registry.db +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: custom_ollama + model_type: llm +- metadata: + embedding_dimension: 384 + model_id: all-MiniLM-L6-v2 + provider_id: custom_ollama + provider_model_id: all-minilm:latest + model_type: embedding +shields: [] +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +- toolgroup_id: builtin::code_interpreter + provider_id: code-interpreter +- toolgroup_id: builtin::wolfram_alpha + 
provider_id: wolfram-alpha +server: + port: 8321 +external_providers_dir: /tmp/providers.d diff --git a/tests/unit/distribution/test_distribution.py b/tests/unit/distribution/test_distribution.py new file mode 100644 index 000000000..a4daffb82 --- /dev/null +++ b/tests/unit/distribution/test_distribution.py @@ -0,0 +1,223 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Any, Dict +from unittest.mock import patch + +import pytest +import yaml +from pydantic import BaseModel, Field, ValidationError + +from llama_stack.distribution.datatypes import Api, Provider, StackRunConfig +from llama_stack.distribution.distribution import get_provider_registry +from llama_stack.providers.datatypes import ProviderSpec + + +class SampleConfig(BaseModel): + foo: str = Field( + default="bar", + description="foo", + ) + + @classmethod + def sample_run_config(cls, **kwargs: Any) -> Dict[str, Any]: + return { + "foo": "baz", + } + + +@pytest.fixture +def mock_providers(): + """Mock the available_providers function to return test providers.""" + with patch("llama_stack.providers.registry.inference.available_providers") as mock: + mock.return_value = [ + ProviderSpec( + provider_type="test_provider", + api=Api.inference, + adapter_type="test_adapter", + config_class="test_provider.config.TestProviderConfig", + ) + ] + yield mock + + +@pytest.fixture +def base_config(tmp_path): + """Create a base StackRunConfig with common settings.""" + return StackRunConfig( + image_name="test_image", + providers={ + "inference": [ + Provider( + provider_id="sample_provider", + provider_type="sample", + config=SampleConfig.sample_run_config(), + ) + ] + }, + external_providers_dir=str(tmp_path), + ) + + +@pytest.fixture +def provider_spec_yaml(): + """Common provider spec YAML for testing.""" + return """ +adapter: + 
adapter_type: test_provider + config_class: test_provider.config.TestProviderConfig + module: test_provider +api_dependencies: + - safety +""" + + +@pytest.fixture +def inline_provider_spec_yaml(): + """Common inline provider spec YAML for testing.""" + return """ +module: test_provider +config_class: test_provider.config.TestProviderConfig +pip_packages: + - test-package +api_dependencies: + - safety +optional_api_dependencies: + - vector_io +provider_data_validator: test_provider.validator.TestValidator +container_image: test-image:latest +""" + + +@pytest.fixture +def api_directories(tmp_path): + """Create the API directory structure for testing.""" + # Create remote provider directory + remote_inference_dir = tmp_path / "remote" / "inference" + remote_inference_dir.mkdir(parents=True, exist_ok=True) + + # Create inline provider directory + inline_inference_dir = tmp_path / "inline" / "inference" + inline_inference_dir.mkdir(parents=True, exist_ok=True) + + return remote_inference_dir, inline_inference_dir + + +class TestProviderRegistry: + """Test suite for provider registry functionality.""" + + def test_builtin_providers(self, mock_providers): + """Test loading built-in providers.""" + registry = get_provider_registry(None) + + assert Api.inference in registry + assert "test_provider" in registry[Api.inference] + assert registry[Api.inference]["test_provider"].provider_type == "test_provider" + assert registry[Api.inference]["test_provider"].api == Api.inference + + def test_external_remote_providers(self, api_directories, mock_providers, base_config, provider_spec_yaml): + """Test loading external remote providers from YAML files.""" + remote_dir, _ = api_directories + with open(remote_dir / "test_provider.yaml", "w") as f: + f.write(provider_spec_yaml) + + registry = get_provider_registry(base_config) + assert len(registry[Api.inference]) == 2 + + assert Api.inference in registry + assert "remote::test_provider" in registry[Api.inference] + provider = 
registry[Api.inference]["remote::test_provider"] + assert provider.adapter.adapter_type == "test_provider" + assert provider.adapter.module == "test_provider" + assert provider.adapter.config_class == "test_provider.config.TestProviderConfig" + assert Api.safety in provider.api_dependencies + + def test_external_inline_providers(self, api_directories, mock_providers, base_config, inline_provider_spec_yaml): + """Test loading external inline providers from YAML files.""" + _, inline_dir = api_directories + with open(inline_dir / "test_provider.yaml", "w") as f: + f.write(inline_provider_spec_yaml) + + registry = get_provider_registry(base_config) + assert len(registry[Api.inference]) == 2 + + assert Api.inference in registry + assert "inline::test_provider" in registry[Api.inference] + provider = registry[Api.inference]["inline::test_provider"] + assert provider.provider_type == "inline::test_provider" + assert provider.module == "test_provider" + assert provider.config_class == "test_provider.config.TestProviderConfig" + assert provider.pip_packages == ["test-package"] + assert Api.safety in provider.api_dependencies + assert Api.vector_io in provider.optional_api_dependencies + assert provider.provider_data_validator == "test_provider.validator.TestValidator" + assert provider.container_image == "test-image:latest" + + def test_invalid_yaml(self, api_directories, mock_providers, base_config): + """Test handling of invalid YAML files.""" + remote_dir, inline_dir = api_directories + with open(remote_dir / "invalid.yaml", "w") as f: + f.write("invalid: yaml: content: -") + with open(inline_dir / "invalid.yaml", "w") as f: + f.write("invalid: yaml: content: -") + + with pytest.raises(yaml.YAMLError): + get_provider_registry(base_config) + + def test_missing_directory(self, mock_providers): + """Test handling of missing external providers directory.""" + config = StackRunConfig( + image_name="test_image", + providers={ + "inference": [ + Provider( + 
provider_id="sample_provider", + provider_type="sample", + config=SampleConfig.sample_run_config(), + ) + ] + }, + external_providers_dir="/nonexistent/dir", + ) + with pytest.raises(FileNotFoundError): + get_provider_registry(config) + + def test_empty_api_directory(self, api_directories, mock_providers, base_config): + """Test handling of empty API directory.""" + registry = get_provider_registry(base_config) + assert len(registry[Api.inference]) == 1 # Only built-in provider + + def test_malformed_remote_provider_spec(self, api_directories, mock_providers, base_config): + """Test handling of malformed remote provider spec (missing required fields).""" + remote_dir, _ = api_directories + malformed_spec = """ +adapter: + adapter_type: test_provider + # Missing required fields +api_dependencies: + - safety +""" + with open(remote_dir / "malformed.yaml", "w") as f: + f.write(malformed_spec) + + with pytest.raises(ValidationError): + get_provider_registry(base_config) + + def test_malformed_inline_provider_spec(self, api_directories, mock_providers, base_config): + """Test handling of malformed inline provider spec (missing required fields).""" + _, inline_dir = api_directories + malformed_spec = """ +module: test_provider +# Missing required config_class +pip_packages: + - test-package +""" + with open(inline_dir / "malformed.yaml", "w") as f: + f.write(malformed_spec) + + with pytest.raises(KeyError) as exc_info: + get_provider_registry(base_config) + assert "config_class" in str(exc_info.value) From 3a9be58523254b7f471e636822c55963be41801e Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 9 Apr 2025 04:34:26 -0400 Subject: [PATCH 19/24] fix: use ollama list to find models (#1854) # What does this PR do? 
closes #1853 ## Test Plan ``` uv run llama stack build --image-type conda --image-name ollama --config llama_stack/templates/ollama/build.yaml ollama pull llama3.2:3b LLAMA_STACK_CONFIG=http://localhost:8321 uv run pytest tests/integration/inference/test_text_inference.py -v --text-model=llama3.2:3b ``` --- llama_stack/providers/remote/inference/ollama/ollama.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index 5a78c07cc..12902996b 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -307,9 +307,10 @@ class OllamaInferenceAdapter(Inference, ModelsProtocolPrivate): if model.model_type == ModelType.embedding: logger.info(f"Pulling embedding model `{model.provider_resource_id}` if necessary...") await self.client.pull(model.provider_resource_id) - response = await self.client.list() - else: - response = await self.client.ps() + # we use list() here instead of ps() - + # - ps() only lists running models, not available models + # - models not currently running are run by the ollama server as needed + response = await self.client.list() available_models = [m["model"] for m in response["models"]] if model.provider_resource_id not in available_models: raise ValueError( From a2cf2999066aa583f6e356a6580862184916a998 Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 9 Apr 2025 04:35:19 -0400 Subject: [PATCH 20/24] fix: update getting started guide to use `ollama pull` (#1855) # What does this PR do? download the getting started w/ ollama model instead of downloading and running it. 
directly running it was necessary before https://github.com/meta-llama/llama-stack/pull/1854 ## Test Plan run the code on the page --- docs/source/getting_started/index.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index ef258a9cf..e9ad51961 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -6,13 +6,13 @@ Llama Stack is a stateful service with REST APIs to support seamless transition In this guide, we'll walk through how to build a RAG agent locally using Llama Stack with [Ollama](https://ollama.com/) to run inference on a Llama Model. -### 1. Start Ollama +### 1. Download a Llama model with Ollama ```bash -ollama run llama3.2:3b --keepalive 60m +ollama pull llama3.2:3b-instruct-fp16 ``` -By default, Ollama keeps the model loaded in memory for 5 minutes which can be too short. We set the `--keepalive` flag to 60 minutes to ensure the model remains loaded for sometime. +This will instruct the Ollama service to download the Llama 3.2 3B Instruct model, which we'll use in the rest of this guide. ```{admonition} Note :class: tip From 22814299b00ecd4fbd996d8a631aef6645818e6e Mon Sep 17 00:00:00 2001 From: Paolo Dettori Date: Wed, 9 Apr 2025 04:56:07 -0400 Subject: [PATCH 21/24] fix: solve unregister_toolgroup error (#1608) # What does this PR do? Fixes issue #1537 that causes "500 Internal Server Error" when unregistering a toolgroup # (Closes #1537 ) ## Test Plan ```console $ pytest -s -v tests/integration/tool_runtime/test_registration.py --stack-config=ollama --env INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" INFO 2025-03-14 21:15:03,999 tests.integration.conftest:41 tests: Setting DISABLE_CODE_SANDBOX=1 for macOS /opt/homebrew/lib/python3.10/site-packages/pytest_asyncio/plugin.py:207: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset. 
The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. Valid fixture loop scopes are: "function", "class", "module", "package", "session" warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET)) ===================================================== test session starts ===================================================== platform darwin -- Python 3.10.16, pytest-8.3.5, pluggy-1.5.0 -- /opt/homebrew/opt/python@3.10/bin/python3.10 cachedir: .pytest_cache rootdir: /Users/paolo/Projects/aiplatform/llama-stack configfile: pyproject.toml plugins: asyncio-0.25.3, anyio-4.8.0 asyncio: mode=strict, asyncio_default_fixture_loop_scope=None collected 1 item tests/integration/tool_runtime/test_registration.py::test_register_and_unregister_toolgroup[None-None-None-None-None] INFO 2025-03-14 21:15:04,478 llama_stack.providers.remote.inference.ollama.ollama:75 inference: checking connectivity to Ollama at `http://localhost:11434`... INFO 2025-03-14 21:15:05,350 llama_stack.providers.remote.inference.ollama.ollama:294 inference: Pulling embedding model `all-minilm:latest` if necessary... INFO: Started server process [78391] INFO: Waiting for application startup. INFO: Application startup complete. 
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) INFO: 127.0.0.1:57424 - "GET /sse HTTP/1.1" 200 OK INFO: 127.0.0.1:57434 - "GET /sse HTTP/1.1" 200 OK INFO 2025-03-14 21:15:16,129 mcp.client.sse:51 uncategorized: Connecting to SSE endpoint: http://localhost:8000/sse INFO: 127.0.0.1:57445 - "GET /sse HTTP/1.1" 200 OK INFO 2025-03-14 21:15:16,146 mcp.client.sse:71 uncategorized: Received endpoint URL: http://localhost:8000/messages/?session_id=c5b6fc01f8dc4b5e80e38eb1c1b22a9b INFO 2025-03-14 21:15:16,147 mcp.client.sse:140 uncategorized: Starting post writer with endpoint URL: http://localhost:8000/messages/?session_id=c5b6fc01f8dc4b5e80e38eb1c1b22a9b INFO: 127.0.0.1:57447 - "POST /messages/?session_id=c5b6fc01f8dc4b5e80e38eb1c1b22a9b HTTP/1.1" 202 Accepted INFO: 127.0.0.1:57447 - "POST /messages/?session_id=c5b6fc01f8dc4b5e80e38eb1c1b22a9b HTTP/1.1" 202 Accepted INFO: 127.0.0.1:57447 - "POST /messages/?session_id=c5b6fc01f8dc4b5e80e38eb1c1b22a9b HTTP/1.1" 202 Accepted INFO 2025-03-14 21:15:16,155 mcp.server.lowlevel.server:535 uncategorized: Processing request of type ListToolsRequest PASSED =============================================== 1 passed, 4 warnings in 12.17s ================================================ ``` --------- Signed-off-by: Paolo Dettori --- .../distribution/routers/routing_tables.py | 4 +- .../tool_runtime/test_registration.py | 124 ++++++++++++++++++ 2 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 tests/integration/tool_runtime/test_registration.py diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 557330df7..f6adae49d 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -608,8 +608,8 @@ class ToolGroupsRoutingTable(CommonRoutingTableImpl, ToolGroups): tool_group = await self.get_tool_group(toolgroup_id) if tool_group is None: raise ValueError(f"Tool group 
{toolgroup_id} not found") - tools = (await self.list_tools(toolgroup_id)).data - for tool in tools: + tools = await self.list_tools(toolgroup_id) + for tool in getattr(tools, "data", []): await self.unregister_object(tool) await self.unregister_object(tool_group) diff --git a/tests/integration/tool_runtime/test_registration.py b/tests/integration/tool_runtime/test_registration.py new file mode 100644 index 000000000..e04b56652 --- /dev/null +++ b/tests/integration/tool_runtime/test_registration.py @@ -0,0 +1,124 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import socket +import threading +import time + +import httpx +import mcp.types as types +import pytest +import uvicorn +from llama_stack_client.types.shared_params.url import URL +from mcp.server.fastmcp import Context, FastMCP +from mcp.server.sse import SseServerTransport +from starlette.applications import Starlette +from starlette.routing import Mount, Route + + +@pytest.fixture(scope="module") +def mcp_server(): + server = FastMCP("FastMCP Test Server") + + @server.tool() + async def fetch(url: str, ctx: Context) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]: + headers = {"User-Agent": "MCP Test Server (github.com/modelcontextprotocol/python-sdk)"} + async with httpx.AsyncClient(follow_redirects=True, headers=headers) as client: + response = await client.get(url) + response.raise_for_status() + return [types.TextContent(type="text", text=response.text)] + + sse = SseServerTransport("/messages/") + + async def handle_sse(request): + async with sse.connect_sse(request.scope, request.receive, request._send) as streams: + await server._mcp_server.run( + streams[0], + streams[1], + server._mcp_server.create_initialization_options(), + ) + + app = Starlette( + debug=True, + routes=[ + Route("/sse", endpoint=handle_sse), + 
Mount("/messages/", app=sse.handle_post_message), + ], + ) + + def get_open_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("", 0)) + return sock.getsockname()[1] + + port = get_open_port() + + def run_server(): + uvicorn.run(app, host="0.0.0.0", port=port) + + # Start the server in a new thread + server_thread = threading.Thread(target=run_server, daemon=True) + server_thread.start() + + # Polling until the server is ready + timeout = 10 + start_time = time.time() + + while time.time() - start_time < timeout: + try: + response = httpx.get(f"http://localhost:{port}/sse") + if response.status_code == 200: + break + except (httpx.RequestError, httpx.HTTPStatusError): + pass + time.sleep(0.1) + + yield port + + +def test_register_and_unregister_toolgroup(llama_stack_client, mcp_server): + """ + Integration test for registering and unregistering a toolgroup using the ToolGroups API. + """ + port = mcp_server + test_toolgroup_id = "remote::web-fetch" + provider_id = "model-context-protocol" + + # Cleanup before running the test + toolgroups = llama_stack_client.toolgroups.list() + for toolgroup in toolgroups: + if toolgroup.identifier == test_toolgroup_id: + llama_stack_client.toolgroups.unregister(toolgroup_id=test_toolgroup_id) + + # Register the toolgroup + llama_stack_client.toolgroups.register( + toolgroup_id=test_toolgroup_id, + provider_id=provider_id, + mcp_endpoint=URL(uri=f"http://localhost:{port}/sse"), + ) + + # Verify registration + registered_toolgroup = llama_stack_client.toolgroups.get(toolgroup_id=test_toolgroup_id) + assert registered_toolgroup is not None + assert registered_toolgroup.identifier == test_toolgroup_id + assert registered_toolgroup.provider_id == provider_id + + # Verify tools listing + tools_list_response = llama_stack_client.tools.list(toolgroup_id=test_toolgroup_id) + assert isinstance(tools_list_response, list) + assert tools_list_response + + # Unregister the toolgroup + 
llama_stack_client.toolgroups.unregister(toolgroup_id=test_toolgroup_id) + + # Verify it is unregistered + with pytest.raises(ValueError, match=f"Tool group '{test_toolgroup_id}' not found"): + llama_stack_client.toolgroups.get(toolgroup_id=test_toolgroup_id) + + # Verify tools are also unregistered + unregister_tools_list_response = llama_stack_client.tools.list(toolgroup_id=test_toolgroup_id) + assert isinstance(unregister_tools_list_response, list) + assert not unregister_tools_list_response From 30b49d8dfa26fa0c07f39b6e7ce59d207fbcea82 Mon Sep 17 00:00:00 2001 From: Jaland Date: Wed, 9 Apr 2025 10:45:15 +0100 Subject: [PATCH 22/24] fix: Playground Container Issue (#1868) **What does this PR do?** This PR fixes a build issue with the Containerfile caused by missing requirement `llama-stack`. It updates the Containerfile to include the necessary requirements and upgrades the Python version to ensure successful builds. **Test Plan** The updated Containerfile has been tested, and the build now completes successfully with the required dependencies included. --- llama_stack/distribution/ui/Containerfile | 2 +- llama_stack/distribution/ui/requirements.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llama_stack/distribution/ui/Containerfile b/llama_stack/distribution/ui/Containerfile index a97f25753..0126d1867 100644 --- a/llama_stack/distribution/ui/Containerfile +++ b/llama_stack/distribution/ui/Containerfile @@ -1,7 +1,7 @@ # More info on playground configuration can be found here: # https://llama-stack.readthedocs.io/en/latest/playground -FROM python:3.9-slim +FROM python:3.12-slim WORKDIR /app COPY . 
/app/ RUN /usr/local/bin/python -m pip install --upgrade pip && \ diff --git a/llama_stack/distribution/ui/requirements.txt b/llama_stack/distribution/ui/requirements.txt index 39f2b3d27..1e0456267 100644 --- a/llama_stack/distribution/ui/requirements.txt +++ b/llama_stack/distribution/ui/requirements.txt @@ -2,3 +2,4 @@ streamlit pandas llama-stack-client>=0.0.55 streamlit-option-menu +llama-stack>=0.1.9 From 96571053049e016b3509187ad3b00ce4fa86dc72 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Wed, 9 Apr 2025 09:26:52 -0400 Subject: [PATCH 23/24] feat: Add tools page to playground (#1904) # What does this PR do? This PR adds an additional page to the playground called "Tools". This page connects to a llama-stack server and lists all the available LLM models, builtin tools and MCP tools in the sidebar. Users can select whatever combination of model and tools they want from the sidebar for their agent. Once the selections are made, users can chat with their agent similarly to the RAG page and test out agent tool use. closes #1902 ## Test Plan Ran the following commands with a llama-stack server and the updated playground worked as expected. 
``` export LLAMA_STACK_ENDPOINT="http://localhost:8321" streamlit run llama_stack/distribution/ui/app.py ``` [//]: # (## Documentation) Signed-off-by: Michael Clifford --- llama_stack/distribution/ui/app.py | 2 + .../distribution/ui/page/playground/tools.py | 116 ++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 llama_stack/distribution/ui/page/playground/tools.py diff --git a/llama_stack/distribution/ui/app.py b/llama_stack/distribution/ui/app.py index 045b07982..441f65d20 100644 --- a/llama_stack/distribution/ui/app.py +++ b/llama_stack/distribution/ui/app.py @@ -24,6 +24,7 @@ def main(): # Playground pages chat_page = st.Page("page/playground/chat.py", title="Chat", icon="💬", default=True) rag_page = st.Page("page/playground/rag.py", title="RAG", icon="💬", default=False) + tool_page = st.Page("page/playground/tools.py", title="Tools", icon="🛠", default=False) # Distribution pages resources_page = st.Page("page/distribution/resources.py", title="Resources", icon="🔍", default=False) @@ -39,6 +40,7 @@ def main(): "Playground": [ chat_page, rag_page, + tool_page, application_evaluation_page, native_evaluation_page, ], diff --git a/llama_stack/distribution/ui/page/playground/tools.py b/llama_stack/distribution/ui/page/playground/tools.py new file mode 100644 index 000000000..e987f617b --- /dev/null +++ b/llama_stack/distribution/ui/page/playground/tools.py @@ -0,0 +1,116 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +import uuid + +import streamlit as st +from llama_stack_client import Agent + +from llama_stack.distribution.ui.modules.api import llama_stack_api + + +def tool_chat_page(): + st.title("🛠 Tools") + + client = llama_stack_api.client + models = client.models.list() + model_list = [model.identifier for model in models if model.api_model_type == "llm"] + + tool_groups = client.toolgroups.list() + tool_groups_list = [tool_group.identifier for tool_group in tool_groups] + mcp_tools_list = [tool for tool in tool_groups_list if tool.startswith("mcp::")] + builtin_tools_list = [tool for tool in tool_groups_list if not tool.startswith("mcp::")] + + def reset_agent(): + st.session_state.clear() + st.cache_resource.clear() + + with st.sidebar: + st.subheader("Model") + model = st.selectbox(label="models", options=model_list, on_change=reset_agent) + + st.subheader("Builtin Tools") + toolgroup_selection = st.pills( + label="Available ToolGroups", options=builtin_tools_list, selection_mode="multi", on_change=reset_agent + ) + + st.subheader("MCP Servers") + mcp_selection = st.pills( + label="Available MCP Servers", options=mcp_tools_list, selection_mode="multi", on_change=reset_agent + ) + + toolgroup_selection.extend(mcp_selection) + + active_tool_list = [] + for toolgroup_id in toolgroup_selection: + active_tool_list.extend( + [ + f"{''.join(toolgroup_id.split('::')[1:])}:{t.identifier}" + for t in client.tools.list(toolgroup_id=toolgroup_id) + ] + ) + + st.subheader(f"Active Tools: 🛠 {len(active_tool_list)}") + st.json(active_tool_list) + + @st.cache_resource + def create_agent(): + return Agent( + client, + model=model, + instructions="You are a helpful assistant. 
When you use a tool always respond with a summary of the result.", + tools=toolgroup_selection, + sampling_params={ + "strategy": {"type": "greedy"}, + }, + ) + + agent = create_agent() + + if "agent_session_id" not in st.session_state: + st.session_state["agent_session_id"] = agent.create_session(session_name=f"tool_demo_{uuid.uuid4()}") + + session_id = st.session_state["agent_session_id"] + + if "messages" not in st.session_state: + st.session_state["messages"] = [{"role": "assistant", "content": "How can I help you?"}] + + for msg in st.session_state.messages: + with st.chat_message(msg["role"]): + st.markdown(msg["content"]) + + if prompt := st.chat_input(placeholder=""): + with st.chat_message("user"): + st.markdown(prompt) + + st.session_state.messages.append({"role": "user", "content": prompt}) + + turn_response = agent.create_turn( + session_id=session_id, + messages=[{"role": "user", "content": prompt}], + stream=True, + ) + + def response_generator(turn_response): + for response in turn_response: + if hasattr(response.event, "payload"): + print(response.event.payload) + if response.event.payload.event_type == "step_progress": + if hasattr(response.event.payload.delta, "text"): + yield response.event.payload.delta.text + if response.event.payload.event_type == "step_complete": + if response.event.payload.step_details.step_type == "tool_execution": + yield " 🛠 " + else: + yield f"Error occurred in the Llama Stack Cluster: {response}" + + with st.chat_message("assistant"): + response = st.write_stream(response_generator(turn_response)) + + st.session_state.messages.append({"role": "assistant", "content": response}) + + +tool_chat_page() From 692f56068c24c42fe4a5543aec04a3f7f9bd3925 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Wed, 9 Apr 2025 09:34:41 -0400 Subject: [PATCH 24/24] docs: Add recent release notes (#1899) # What does this PR do? 
These are missing and changelog doc automation is not working yet due to missing permissions for GitHub Actions: https://dev.to/suzuki0430/how-to-enable-the-allow-github-actions-to-create-and-approve-pull-requests-option-when-its-grayed-out-3e1i --------- Signed-off-by: Yuan Tang --- CHANGELOG.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 953d04def..5086094ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,42 @@ # Changelog +# v0.2.1 +Published on: 2025-04-05T23:13:00Z + + + +--- + +# v0.2.0 +Published on: 2025-04-05T19:04:29Z + +## Llama 4 Support + +Check out more at https://www.llama.com + + + +--- + +# v0.1.9 +Published on: 2025-03-29T00:52:23Z + +### Build and Test Agents +* Agents: Entire document context with attachments +* RAG: Documentation with sqlite-vec faiss comparison +* Getting started: Fixes to getting started notebook. + +### Agent Evals and Model Customization +* (**New**) Post-training: Add nemo customizer + +### Better Engineering +* Moved sqlite-vec to non-blocking calls +* Don't return a payload on file delete + + + +--- + # v0.1.8 Published on: 2025-03-24T01:28:50Z