Merge branch 'main' into support-nvidia-hosted-vision-models

Matthew Farrellee, 2025-03-10 11:08:34 -04:00
commit 6f16396447
381 changed files with 189108 additions and 9338 deletions


@@ -0,0 +1,9 @@
---
description: General rules always applicable across the project
globs:
alwaysApply: true
---
# Style
- Comments must add value to code. Don't write filler comments explaining what you are doing next; they just add noise.
- Add a comment to clarify surprising behavior that would not otherwise be obvious. Good variable naming and clear code organization are more important.
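
A hypothetical illustration of the comment rule above (generic Python, not code from this repo; the gateway behavior mentioned in the comment is invented for the example):

```python
import time


def fetch_with_retry(fetch, attempts: int = 5):
    """Call `fetch` with exponential backoff, capped at 30 seconds."""
    for attempt in range(attempts):
        try:
            return fetch()
        except ConnectionError:
            # Filler (avoid): "sleep before retrying".
            # Valuable (keep): the cap matters because the upstream gateway in this
            # hypothetical setup drops connections that idle longer than ~30 seconds.
            time.sleep(min(2**attempt, 30))
    raise ConnectionError("all retry attempts failed")
```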

.github/CODEOWNERS (2 changed lines)

@@ -2,4 +2,4 @@
 # These owners will be the default owners for everything in
 # the repo. Unless a later match takes precedence,
-* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan
+* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722

.github/dependabot.yml (new file, 8 lines)

@@ -0,0 +1,8 @@
# GitHub Dependabot configuration
version: 2
updates:
  # Enable version updates for GitHub Actions
  - package-ecosystem: "github-actions"
    directory: "/" # Will use the default workflow location of `.github/workflows`
    schedule:
      interval: "daily"


@@ -310,7 +310,7 @@ jobs:
       - name: "PR - Upload Test Summary"
         id: pr_test_summary_upload
         if: github.event_name == 'pull_request_target'
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: test-summary
           path: test-summary.md
@@ -320,7 +320,7 @@ jobs:
       - name: "PR - Update comment"
         id: pr_update_comment
         if: github.event_name == 'pull_request_target'
-        uses: thollander/actions-comment-pull-request@v2
+        uses: thollander/actions-comment-pull-request@v3
         with:
           filePath: test-summary.md

.github/workflows/unit-tests.yml (new file, 36 lines)

@@ -0,0 +1,36 @@
name: Unit Tests
on:
  pull_request:
    branches: [ main ]
  workflow_dispatch:
jobs:
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10.16'
      - uses: astral-sh/setup-uv@v5
        with:
          python-version: '3.10.16'
          enable-cache: false
      - name: Run unit tests
        run: |
          uv run -p 3.10.16 --with . --with ".[dev]" --with ".[test]" pytest -s -v tests/unit/ --junitxml=pytest-report.xml
      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: test-results
          path: |
            .pytest_cache/
            pytest-report.xml
          retention-days: 7


@@ -12,12 +12,14 @@ on:
       - main
     paths:
       - 'docs/**'
+      - 'pyproject.toml'
       - '.github/workflows/update-readthedocs.yml'
   pull_request:
     branches:
       - main
     paths:
       - 'docs/**'
+      - 'pyproject.toml'
       - '.github/workflows/update-readthedocs.yml'
 jobs:

.gitignore (1 changed line)

@@ -20,3 +20,4 @@ _build
 docs/src
 pyrightconfig.json
 venv/
+pytest-report.xml

.gitmodules (no content changes)


@@ -8,15 +8,13 @@ repos:
     rev: v5.0.0  # Latest stable version
     hooks:
       - id: check-merge-conflict
-      - id: trailing-whitespace
-        exclude: '\.py$'  # Exclude Python files as Ruff already handles them
       - id: check-added-large-files
         args: ['--maxkb=1000']
       - id: end-of-file-fixer
         exclude: '^(.*\.svg)$'
-      # Temporarily disabling this
-      # - id: no-commit-to-branch
-      #   args: ['--branch=main']
   - repo: https://github.com/Lucas-C/pre-commit-hooks
     rev: v1.5.4
     hooks:
@@ -42,8 +40,9 @@ repos:
           - black==24.3.0
   - repo: https://github.com/astral-sh/uv-pre-commit
-    rev: 0.5.26
+    rev: 0.6.3
     hooks:
+      - id: uv-lock
       - id: uv-export
         args: [
           "--frozen",
@@ -51,8 +50,6 @@ repos:
           "--no-emit-project",
           "--output-file=requirements.txt"
         ]
-        files: ^pyproject\.toml$
-      - id: uv-sync
   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.15.0
@@ -67,12 +64,6 @@ repos:
           - pydantic
         pass_filenames: false
-  # - repo: https://github.com/jsh9/pydoclint
-  #   rev: d88180a8632bb1602a4d81344085cf320f288c5a
-  #   hooks:
-  #     - id: pydoclint
-  #       args: [--config=pyproject.toml]
   # - repo: https://github.com/tcort/markdown-link-check
   #   rev: v3.11.2
   #   hooks:
@@ -84,10 +75,8 @@ repos:
       - id: distro-codegen
         name: Distribution Template Codegen
         additional_dependencies:
-          - rich
-          - pydantic
           - uv==0.6.0
-        entry: uv run python -m llama_stack.scripts.distro_codegen
+        entry: uv run --extra codegen python -m llama_stack.scripts.distro_codegen
         language: python
         pass_filenames: false
         require_serial: true

CHANGELOG.md (new file, 304 lines)

@@ -0,0 +1,304 @@
# Changelog
# v0.1.6
Published on: 2025-03-08T04:35:08Z
## 0.1.6 Release Notes
### Build and Test Agents
* Inference: Fixed support for inline vllm provider
* (**New**) Agent: Build & Monitor Agent Workflows with Llama Stack + Anthropic's Best Practice [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb)
* (**New**) Agent: Revamped agent [documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent.html) with more details and examples
* Agent: Unify tools and Python SDK Agents API
* Agent: AsyncAgent Python SDK wrapper supporting async client tool calls
* Agent: Support plain Python functions as client tools, without requiring the @client_tool decorator (see the sketch after this list)
* Agent: Deprecated the allow_resume_turn flag and removed the need to specify tool_prompt_format
* VectorIO: MilvusDB support added
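
A minimal sketch of the plain-function client tool support noted above. It assumes the unified `Agent` API from the `llama_stack_client` Python SDK described in these notes; the server URL, model id, and the `get_ticker_price` helper are placeholders.

```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent


def get_ticker_price(symbol: str) -> str:
    """Return the latest price for a stock symbol (stubbed for illustration)."""
    return f"{symbol}: 123.45 USD"


client = LlamaStackClient(base_url="http://localhost:8321")

# A plain function (no @client_tool decorator) is passed directly as a client tool;
# its name, docstring, and type hints describe the tool to the model.
agent = Agent(
    client,
    model="meta-llama/Llama-3.3-70B-Instruct",
    instructions="You are a helpful assistant. Use tools when they help.",
    tools=[get_ticker_price],
)

session_id = agent.create_session("client-tool-demo")
turn = agent.create_turn(
    session_id=session_id,
    messages=[{"role": "user", "content": "What is ACME trading at right now?"}],
    stream=False,
)
print(turn.output_message.content)
```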
### Agent Evals and Model Customization
* (**New**) Agent: Llama Stack RAG Lifecycle [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb)
* Eval: Documentation for eval, scoring, adding new benchmarks
* Eval: Distribution template to run benchmarks on llama & non-llama models
* Eval: Ability to register new custom LLM-as-judge scoring functions
* (**New**) Looking for contributors for open benchmarks. See [documentation](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) for details.
### Deploy and Monitoring of Agents
* Better support for different log levels across all components for better monitoring
### Better Engineering
* Enhance OpenAPI spec to include Error types across all APIs
* Moved all tests to /tests and created unit tests to run on each PR
* Removed all dependencies on llama-models repo
---
# v0.1.5.1
Published on: 2025-02-28T22:37:44Z
## 0.1.5.1 Release Notes
* Fixes for security risk in https://github.com/meta-llama/llama-stack/pull/1327 and https://github.com/meta-llama/llama-stack/pull/1328
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.5...v0.1.5.1
---
# v0.1.5
Published on: 2025-02-28T18:14:01Z
## 0.1.5 Release Notes
### Build Agents
* Inference: Support more non-llama models (openai, anthropic, gemini)
* Inference: Can use the provider's own model name in addition to the HF alias (see the example after this list)
* Inference: Fixed issues with calling tools that weren't specified in the prompt
* RAG: Improved system prompt for RAG and no more need for hard-coded rag-tool calling
* Embeddings: Added support for Nemo retriever embedding models
* Tools: Added support for MCP tools in Ollama Distribution
* Distributions: Added new Groq distribution
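
A short example of the model-naming flexibility mentioned above; the server URL and both identifiers are placeholders that depend on which providers you have configured.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")
messages = [{"role": "user", "content": "Say hello in one sentence."}]

# The HF-style alias registered with the stack ...
r1 = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.1-8B-Instruct",
    messages=messages,
)

# ... or the provider's own model name (an example id for a Fireworks deployment).
r2 = client.inference.chat_completion(
    model_id="accounts/fireworks/models/llama-v3p1-8b-instruct",
    messages=messages,
)

print(r1.completion_message.content)
print(r2.completion_message.content)
```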
### Customize Models
* Save post-trained checkpoint in SafeTensor format to allow Ollama inference provider to use the post-trained model
### Monitor agents
* More comprehensive logging of agent steps including client tools
* Telemetry inputs/outputs are now structured and queryable
* Ability to retrieve agents session, turn, step by ids
### Better Engineering
* Moved executorch Swift code out of this repo into the llama-stack-client-swift repo, similar to kotlin
* Move most logging to use logger instead of prints
* Completed text /chat-completion and /completion tests
---
# v0.1.4
Published on: 2025-02-25T00:02:43Z
## v0.1.4 Release Notes
Here are the key changes coming as part of this release:
### Build and Test Agents
* Inference: Added support for non-llama models
* Inference: Added option to list all downloaded models and remove models
* Agent: Introduce new api agents.resume_turn to include client side tool execution in the same turn
* Agent: AgentConfig introduces new variable “tool_config” that allows for better tool configuration and system prompt overrides
* Agent: Added logging for agent step start and completion times
* Agent: Added support for logging for tool execution metadata
* Embedding: Updated /inference/embeddings to support asymmetric models, truncation, and variable-sized outputs (see the sketch after this list)
* Embedding: Updated embedding models for Ollama, Together, and Fireworks with available defaults
* VectorIO: Improved performance of sqlite-vec using chunked writes
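
A sketch of the updated embeddings call referenced above. The parameter names (`text_truncation`, `output_dimension`, `task_type`) and the model id are assumptions based on this release note, not a definitive signature.

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

response = client.inference.embeddings(
    model_id="nvidia/nv-embedqa-e5-v5",   # placeholder embedding model id
    contents=["What is Llama Stack?"],
    text_truncation="end",                # assumed: truncate overly long inputs
    output_dimension=384,                 # assumed: request a smaller vector size
    task_type="query",                    # assumed: asymmetric models separate query vs document
)
print(len(response.embeddings[0]))
```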
### Agent Evals and Model Customization
* Deprecated api /eval-tasks. Use /eval/benchmark instead
* Added CPU training support for TorchTune
### Deploy and Monitoring of Agents
* Consistent view of client and server tool calls in telemetry
### Better Engineering
* Made tests more data-driven for consistent evaluation
* Fixed documentation links and improved API reference generation
* Various small fixes for build scripts and system reliability
---
# v0.1.3
Published on: 2025-02-14T20:24:32Z
## v0.1.3 Release
Here are some key changes that are coming as part of this release.
### Build and Test Agents
Streamlined the initial development experience
- Added support for llama stack run --image-type venv
- Enhanced vector store options with new sqlite-vec provider and improved Qdrant integration
- vLLM improvements for tool calling and logprobs
- Better handling of sporadic code_interpreter tool calls
### Agent Evals
Better benchmarking and Agent performance assessment
- Renamed eval API /eval-task to /benchmarks
- Improved documentation and notebooks for RAG and evals
### Deploy and Monitoring of Agents
Improved production readiness
- Added usage metrics collection for chat completions
- CLI improvements for provider information
- Improved error handling and system reliability
- Better model endpoint handling and accessibility
- Improved signal handling on distro server
### Better Engineering
Infrastructure and code quality improvements
- Faster text-based chat completion tests
- Improved testing for non-streaming agent apis
- Standardized import formatting with ruff linter
- Added conventional commits standard
- Fixed documentation parsing issues
---
# v0.1.2
Published on: 2025-02-07T22:06:49Z
# TL;DR
- Several stabilizations to development flows after the switch to `uv`
- Migrated CI workflows to new OSS repo - [llama-stack-ops](https://github.com/meta-llama/llama-stack-ops)
- Added automated rebuilds for ReadTheDocs
- Llama Stack server supports HTTPS
- Added system prompt overrides support
- Several bug fixes and improvements to documentation (check out Kubernetes deployment guide by @terrytangyuan )
---
# v0.1.1
Published on: 2025-02-02T02:29:24Z
A bunch of small / big improvements everywhere including support for Windows, switching to `uv` and many provider improvements.
---
# v0.1.0
Published on: 2025-01-24T17:47:47Z
We are excited to announce a stable API release of Llama Stack, which enables developers to build RAG applications and Agents using tools and safety shields, monitor those agents with telemetry, and evaluate them with scoring functions.
## Context
GenAI application developers need more than just an LLM - they need to integrate tools, connect with their data sources, establish guardrails, and ground the LLM responses effectively. Currently, developers must piece together various tools and APIs, complicating the development lifecycle and increasing costs. The result is that developers are spending more time on these integrations rather than focusing on the application logic itself. The bespoke coupling of components also makes it challenging to adopt state-of-the-art solutions in the rapidly evolving GenAI space. This is particularly difficult for open models like Llama, as best practices are not widely established in the open.
Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs from both AI developers and partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety.
With Llama Stack, you can easily build a RAG agent that can also search the web, do complex math, and call custom tools. You can use telemetry to inspect those traces and convert telemetry into evals datasets. And with Llama Stack's plugin architecture and prepackaged distributions, you can choose to run your agent anywhere: in the cloud with our partners, in your own environment using virtualenv, conda, or Docker, locally with Ollama, or even on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.
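
As a flavor of what that looks like in code, here is a compact sketch using the Python SDK. It assumes a running distribution with a web-search tool group registered; the tool group name, model id, and server URL are placeholders.

```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent

client = LlamaStackClient(base_url="http://localhost:8321")

agent = Agent(
    client,
    model="meta-llama/Llama-3.3-70B-Instruct",
    instructions="Answer questions, using web search when fresh information is needed.",
    tools=["builtin::websearch"],  # assumed tool group name registered on the server
)

session_id = agent.create_session("web-search-demo")
turn = agent.create_turn(
    session_id=session_id,
    messages=[{"role": "user", "content": "Summarize this week's Llama ecosystem news."}],
    stream=False,
)
print(turn.output_message.content)
```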
## Release
After iterating on the APIs for the last 3 months, today we're launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages (v0.1.0). We now have automated tests that verify every provider implementation, so developers can easily and reliably select distributions or providers based on their specific requirements.
There are example standalone apps in llama-stack-apps.
## Key Features of this release
- **Unified API Layer**
- Inference: Run LLM models
- RAG: Store and retrieve knowledge for RAG
- Agents: Build multi-step agentic workflows
- Tools: Register tools that can be called by the agent
- Safety: Apply content filtering and safety policies
- Evaluation: Test model and agent quality
- Telemetry: Collect and analyze usage data and complex agentic traces
- Post Training ( Coming Soon ): Fine tune models for specific use cases
- **Rich Provider Ecosystem**
- Local Development: Meta's Reference, Ollama
- Cloud: Fireworks, Together, Nvidia, AWS Bedrock, Groq, Cerebras
- On-premises: Nvidia NIM, vLLM, TGI, Dell-TGI
- On-device: iOS and Android support
- **Built for Production**
- Pre-packaged distributions for common deployment scenarios
- Backwards compatibility across model versions
- Comprehensive evaluation capabilities
- Full observability and monitoring
- **Multiple developer interfaces**
- CLI: Command line interface
- Python SDK
- Swift iOS SDK
- Kotlin Android SDK
- **Sample llama stack applications**
- Python
- iOS
- Android
---
# v0.1.0rc12
Published on: 2025-01-22T22:24:01Z
---
# v0.0.63
Published on: 2024-12-18T07:17:43Z
A small but important bug-fix release to update the URL datatype for the client-SDKs. The issue affected multimodal agentic turns especially.
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.62...v0.0.63
---
# v0.0.62
Published on: 2024-12-18T02:39:43Z
---
# v0.0.61
Published on: 2024-12-10T20:50:33Z
---
# v0.0.55
Published on: 2024-11-23T17:14:07Z
---
# v0.0.54
Published on: 2024-11-22T00:36:09Z
---
# v0.0.53
Published on: 2024-11-20T22:18:00Z
🚀 Initial Release Notes for Llama Stack!
### Added
- Resource-oriented design for models, shields, memory banks, datasets and eval tasks
- Persistence for registered objects with distribution
- Ability to persist memory banks created for FAISS
- PostgreSQL KVStore implementation
- Environment variable placeholder support in run.yaml files (see the sketch after this list)
- Comprehensive Zero-to-Hero notebooks and quickstart guides
- Support for quantized models in Ollama
- Vision models support for Together, Fireworks, Meta-Reference, and Ollama, and vLLM
- Bedrock distribution with safety shields support
- Evals API with task registration and scoring functions
- MMLU and SimpleQA benchmark scoring functions
- Huggingface dataset provider integration for benchmarks
- Support for custom dataset registration from local paths
- Benchmark evaluation CLI tools with visualization tables
- RAG evaluation scoring functions and metrics
- Local persistence for datasets and eval tasks
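
To make the run.yaml placeholder item concrete, here is a small, self-contained resolver for a `${env.VAR:default}`-style placeholder. It illustrates the idea only; it is not the stack's actual implementation, and the exact placeholder grammar may differ.

```python
import os
import re

_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*)(?::([^}]*))?\}")


def resolve_env_placeholders(text: str) -> str:
    """Replace ${env.VAR} / ${env.VAR:default} with values from the environment."""

    def substitute(match: re.Match) -> str:
        name, default = match.group(1), match.group(2)
        value = os.environ.get(name, default)
        if value is None:
            raise ValueError(f"environment variable {name} is not set and has no default")
        return value

    return _PLACEHOLDER.sub(substitute, text)


# Example: resolve a snippet of a run.yaml before loading it.
snippet = "port: ${env.LLAMA_STACK_PORT:8321}"
print(resolve_env_placeholders(snippet))  # -> "port: 8321"
```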
### Changed
- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
- Updated API signatures for dataset and eval task registration
- Restructured folder organization for providers
- Enhanced Docker build configuration
- Added version prefixing for REST API routes
- Enhanced evaluation task registration workflow
- Improved benchmark evaluation output formatting
- Restructured evals folder organization for better modularity
### Removed
- `llama stack configure` command
---


@@ -64,10 +64,23 @@ You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting
 You can install the dependencies by running:
 ```bash
-$ cd llama-stack
-$ uv sync --extra dev
-$ uv pip install -e .
-$ source .venv/bin/activate
+cd llama-stack
+uv sync --extra dev
+uv pip install -e .
+source .venv/bin/activate
+```
+
+Note that you can create a dotenv file `.env` that includes necessary environment variables:
+```
+LLAMA_STACK_BASE_URL=http://localhost:8321
+LLAMA_STACK_CLIENT_LOG=debug
+LLAMA_STACK_PORT=8321
+LLAMA_STACK_CONFIG=
+```
+
+And then use this dotenv file when running client SDK tests via the following:
+```bash
+uv run --env-file .env -- pytest -v tests/api/inference/test_text_inference.py
 ```
 ## Pre-commit Hooks
@@ -75,7 +88,7 @@ $ source .venv/bin/activate
 We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
 ```bash
-$ uv run pre-commit install
+uv run pre-commit install
 ```
 After that, pre-commit hooks will run automatically before each commit.
@@ -83,7 +96,7 @@ After that, pre-commit hooks will run automatically before each commit.
 Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
 ```bash
-$ uv run pre-commit run --all-files
+uv run pre-commit run --all-files
 ```
 > [!CAUTION]
@@ -94,8 +107,8 @@ $ uv run pre-commit run --all-files
 To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
 ```bash
-$ uv add foo
-$ uv sync
+uv add foo
+uv sync
 ```
 ## Coding Style
@@ -110,15 +123,15 @@ Some tips about common tasks you work on while contributing to Llama Stack:
 ### Using `llama stack build`
-Building a stack image (conda / docker) will use the production version of the `llama-stack`, `llama-models` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_MODELS_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
+Building a stack image (conda / docker) will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
 Example:
 ```bash
-$ cd work/
-$ git clone https://github.com/meta-llama/llama-stack.git
-$ git clone https://github.com/meta-llama/llama-models.git
-$ cd llama-stack
-$ LLAMA_STACK_DIR=$(pwd) LLAMA_MODELS_DIR=../llama-models llama stack build --template <...>
+cd work/
+git clone https://github.com/meta-llama/llama-stack.git
+git clone https://github.com/meta-llama/llama-stack-client-python.git
+cd llama-stack
+LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --template <...>
 ```
@@ -131,14 +144,14 @@ If you have made changes to a provider's configuration in any form (introducing
 If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.
 ```bash
-$ cd llama-stack/docs
-$ uv sync --extra docs
+cd llama-stack/docs
+uv sync --extra docs
 # This rebuilds the documentation pages.
-$ uv run make html
+uv run make html
 # This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
-$ uv run sphinx-autobuild source build/html --write-all
+uv run sphinx-autobuild source build/html --write-all
 ```
 ### Update API Documentation
@@ -146,8 +159,8 @@ $ uv run sphinx-autobuild source build/html --write-all
 If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command:
 ```bash
-$ uv sync --extra dev
-$ uv run ./docs/openapi_generator/run_openapi_generator.sh
+uv sync --extra dev
+uv run ./docs/openapi_generator/run_openapi_generator.sh
 ```
 The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.


@@ -1,6 +1,8 @@
 include pyproject.toml
 include distributions/dependencies.json
+include llama_stack/models/llama/llama3/tokenizer.model
 include llama_stack/distribution/*.sh
 include llama_stack/cli/scripts/*.sh
 include llama_stack/templates/*/*.yaml
 include llama_stack/providers/tests/test_cases/inference/*.json
+include llama_stack/models/llama/*/*.md


@@ -32,7 +32,7 @@ Llama Stack standardizes the core building blocks that simplify AI application d
 By reducing friction and complexity, Llama Stack empowers developers to focus on what they do best: building transformative generative AI applications.
 ### API Providers
 Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
 | **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
 |:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|


@@ -216,8 +216,8 @@
     "faiss-cpu",
     "fastapi",
     "fire",
-    "groq",
     "httpx",
+    "litellm",
     "matplotlib",
     "nltk",
     "numpy",
@@ -431,6 +431,7 @@
     "fire",
     "httpx",
     "matplotlib",
+    "mcp",
     "nltk",
     "numpy",
     "ollama",

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long


@@ -3,6 +3,8 @@
 {
  "cell_type": "markdown",
  "source": [
+  "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Alpha_Llama_Stack_Post_Training.ipynb)\n",
+  "\n",
   "# [Alpha] Llama Stack Post Training\n",
   "This notebook will use a real world problem (improve LLM as tax preparer) to walk through the main sets of APIs we offer with Llama stack for post training to improve the LLM performance for agentic apps (We support supervised finetune now, RLHF and knowledge distillation will come soon!).\n",
   "\n",
@@ -64,7 +66,7 @@
  "output_type": "stream",
  "name": "stdout",
  "text": [
-  "Collecting git+https://github.com/meta-llama/llama-stack.git@hf_format_checkpointer\n",
+  "Collecting git+https://github.com/meta-llama/llama-stack.git\n",
   "  Cloning https://github.com/meta-llama/llama-stack.git (to revision hf_format_checkpointer) to /tmp/pip-req-build-j_1bxqzm\n",
   "  Running command git clone --filter=blob:none --quiet https://github.com/meta-llama/llama-stack.git /tmp/pip-req-build-j_1bxqzm\n",
   "  Running command git checkout -b hf_format_checkpointer --track origin/hf_format_checkpointer\n",
@@ -76,7 +78,7 @@
  }
 ],
 "source": [
-  "!pip install git+https://github.com/meta-llama/llama-stack.git@hf_format_checkpointer"
+  "!pip install git+https://github.com/meta-llama/llama-stack.git #TODO: update this after the next pkg release"
 ]
 },
 {
@@ -3673,7 +3675,7 @@
  "    benchmark_id=\"llama3.2-3B-instruct:tax_eval\",\n",
  "    input_rows=eval_rows.rows,\n",
  "    scoring_functions=[\"braintrust::answer-similarity\"],\n",
-  "    task_config={\n",
+  "    benchmark_config={\n",
  "        \"type\": \"benchmark\",\n",
  "        \"eval_candidate\": {\n",
  "            \"type\": \"model\",\n",
@@ -6381,7 +6383,7 @@
  "    benchmark_id=\"Llama-3.2-3B-Instruct-sft-0:tax_eval\",\n",
  "    input_rows=eval_rows.rows,\n",
  "    scoring_functions=[\"braintrust::answer-similarity\"],\n",
-  "    task_config={\n",
+  "    benchmark_config={\n",
  "        \"type\": \"benchmark\",\n",
  "        \"eval_candidate\": {\n",
  "            \"type\": \"model\",\n",

File diff suppressed because it is too large


@@ -45,65 +45,7 @@
   "id": "O9pGVlPIjpix",
   "outputId": "e1fbe723-ae31-4630-eb80-4c4f6476d56f"
  },
-  "outputs": [
+  "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: llama-stack in /usr/local/lib/python3.10/dist-packages (0.0.61)\n",
"Requirement already satisfied: blobfile in /usr/local/lib/python3.10/dist-packages (from llama-stack) (3.0.0)\n",
"Requirement already satisfied: fire in /usr/local/lib/python3.10/dist-packages (from llama-stack) (0.7.0)\n",
"Requirement already satisfied: httpx in /usr/local/lib/python3.10/dist-packages (from llama-stack) (0.28.1)\n",
"Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from llama-stack) (0.26.5)\n",
"Requirement already satisfied: llama-models>=0.0.61 in /usr/local/lib/python3.10/dist-packages (from llama-stack) (0.0.61)\n",
"Requirement already satisfied: llama-stack-client>=0.0.61 in /usr/local/lib/python3.10/dist-packages (from llama-stack) (0.0.61)\n",
"Requirement already satisfied: prompt-toolkit in /usr/local/lib/python3.10/dist-packages (from llama-stack) (3.0.48)\n",
"Requirement already satisfied: python-dotenv in /usr/local/lib/python3.10/dist-packages (from llama-stack) (1.0.1)\n",
"Requirement already satisfied: pydantic>=2 in /usr/local/lib/python3.10/dist-packages (from llama-stack) (2.10.3)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from llama-stack) (2.32.3)\n",
"Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from llama-stack) (13.9.4)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from llama-stack) (75.1.0)\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from llama-stack) (2.5.0)\n",
"Requirement already satisfied: PyYAML in /usr/local/lib/python3.10/dist-packages (from llama-models>=0.0.61->llama-stack) (6.0.2)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from llama-models>=0.0.61->llama-stack) (3.1.4)\n",
"Requirement already satisfied: tiktoken in /usr/local/lib/python3.10/dist-packages (from llama-models>=0.0.61->llama-stack) (0.8.0)\n",
"Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from llama-models>=0.0.61->llama-stack) (10.4.0)\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (3.7.1)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (8.1.7)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (1.9.0)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (2.2.2)\n",
"Requirement already satisfied: pyaml in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (24.12.1)\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (1.3.1)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (4.66.6)\n",
"Requirement already satisfied: typing-extensions<5,>=4.7 in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (4.12.2)\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx->llama-stack) (2024.8.30)\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx->llama-stack) (1.0.7)\n",
"Requirement already satisfied: idna in /usr/local/lib/python3.10/dist-packages (from httpx->llama-stack) (3.10)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx->llama-stack) (0.14.0)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2->llama-stack) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.27.1 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2->llama-stack) (2.27.1)\n",
"Requirement already satisfied: pycryptodomex>=3.8 in /usr/local/lib/python3.10/dist-packages (from blobfile->llama-stack) (3.21.0)\n",
"Requirement already satisfied: urllib3<3,>=1.25.3 in /usr/local/lib/python3.10/dist-packages (from blobfile->llama-stack) (2.2.3)\n",
"Requirement already satisfied: lxml>=4.9 in /usr/local/lib/python3.10/dist-packages (from blobfile->llama-stack) (5.3.0)\n",
"Requirement already satisfied: filelock>=3.0 in /usr/local/lib/python3.10/dist-packages (from blobfile->llama-stack) (3.16.1)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->llama-stack) (2024.9.0)\n",
"Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->llama-stack) (24.2)\n",
"Requirement already satisfied: wcwidth in /usr/local/lib/python3.10/dist-packages (from prompt-toolkit->llama-stack) (0.2.13)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->llama-stack) (3.4.0)\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->llama-stack) (3.0.0)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->llama-stack) (2.18.0)\n",
"Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->llama-stack-client>=0.0.61->llama-stack) (1.2.2)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich->llama-stack) (0.1.2)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->llama-models>=0.0.61->llama-stack) (3.0.2)\n",
"Requirement already satisfied: numpy>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from pandas->llama-stack-client>=0.0.61->llama-stack) (1.26.4)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->llama-stack-client>=0.0.61->llama-stack) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->llama-stack-client>=0.0.61->llama-stack) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->llama-stack-client>=0.0.61->llama-stack) (2024.2)\n",
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken->llama-models>=0.0.61->llama-stack) (2024.9.11)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->llama-stack-client>=0.0.61->llama-stack) (1.17.0)\n"
]
}
],
"source": [ "source": [
"# NBVAL_SKIP\n", "# NBVAL_SKIP\n",
"!pip install -U llama-stack" "!pip install -U llama-stack"
@@ -120,198 +62,10 @@
   "id": "JQpLUSNjlGAM",
   "outputId": "2f7fec97-5511-4cae-d51e-6d262fbca19c"
  },
-  "outputs": [
+  "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: llama-stack in /usr/local/lib/python3.10/dist-packages (0.0.61)\r\n",
"Requirement already satisfied: blobfile in /usr/local/lib/python3.10/dist-packages (from llama-stack) (3.0.0)\r\n",
"Requirement already satisfied: fire in /usr/local/lib/python3.10/dist-packages (from llama-stack) (0.7.0)\r\n",
"Requirement already satisfied: httpx in /usr/local/lib/python3.10/dist-packages (from llama-stack) (0.28.1)\r\n",
"Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from llama-stack) (0.26.5)\r\n",
"Requirement already satisfied: llama-models>=0.0.61 in /usr/local/lib/python3.10/dist-packages (from llama-stack) (0.0.61)\r\n",
"Requirement already satisfied: llama-stack-client>=0.0.61 in /usr/local/lib/python3.10/dist-packages (from llama-stack) (0.0.61)\r\n",
"Requirement already satisfied: prompt-toolkit in /usr/local/lib/python3.10/dist-packages (from llama-stack) (3.0.48)\r\n",
"Requirement already satisfied: python-dotenv in /usr/local/lib/python3.10/dist-packages (from llama-stack) (1.0.1)\r\n",
"Requirement already satisfied: pydantic>=2 in /usr/local/lib/python3.10/dist-packages (from llama-stack) (2.10.3)\r\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from llama-stack) (2.32.3)\r\n",
"Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from llama-stack) (13.9.4)\r\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from llama-stack) (75.1.0)\r\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from llama-stack) (2.5.0)\r\n",
"Requirement already satisfied: PyYAML in /usr/local/lib/python3.10/dist-packages (from llama-models>=0.0.61->llama-stack) (6.0.2)\r\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from llama-models>=0.0.61->llama-stack) (3.1.4)\r\n",
"Requirement already satisfied: tiktoken in /usr/local/lib/python3.10/dist-packages (from llama-models>=0.0.61->llama-stack) (0.8.0)\r\n",
"Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from llama-models>=0.0.61->llama-stack) (10.4.0)\r\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (3.7.1)\r\n",
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (8.1.7)\r\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (1.9.0)\r\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (2.2.2)\r\n",
"Requirement already satisfied: pyaml in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (24.12.1)\r\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (1.3.1)\r\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (4.66.6)\r\n",
"Requirement already satisfied: typing-extensions<5,>=4.7 in /usr/local/lib/python3.10/dist-packages (from llama-stack-client>=0.0.61->llama-stack) (4.12.2)\r\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx->llama-stack) (2024.8.30)\r\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx->llama-stack) (1.0.7)\r\n",
"Requirement already satisfied: idna in /usr/local/lib/python3.10/dist-packages (from httpx->llama-stack) (3.10)\r\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx->llama-stack) (0.14.0)\r\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2->llama-stack) (0.7.0)\r\n",
"Requirement already satisfied: pydantic-core==2.27.1 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2->llama-stack) (2.27.1)\r\n",
"Requirement already satisfied: pycryptodomex>=3.8 in /usr/local/lib/python3.10/dist-packages (from blobfile->llama-stack) (3.21.0)\r\n",
"Requirement already satisfied: urllib3<3,>=1.25.3 in /usr/local/lib/python3.10/dist-packages (from blobfile->llama-stack) (2.2.3)\r\n",
"Requirement already satisfied: lxml>=4.9 in /usr/local/lib/python3.10/dist-packages (from blobfile->llama-stack) (5.3.0)\r\n",
"Requirement already satisfied: filelock>=3.0 in /usr/local/lib/python3.10/dist-packages (from blobfile->llama-stack) (3.16.1)\r\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->llama-stack) (2024.9.0)\r\n",
"Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->llama-stack) (24.2)\r\n",
"Requirement already satisfied: wcwidth in /usr/local/lib/python3.10/dist-packages (from prompt-toolkit->llama-stack) (0.2.13)\r\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->llama-stack) (3.4.0)\r\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->llama-stack) (3.0.0)\r\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->llama-stack) (2.18.0)\r\n",
"Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->llama-stack-client>=0.0.61->llama-stack) (1.2.2)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich->llama-stack) (0.1.2)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->llama-models>=0.0.61->llama-stack) (3.0.2)\n",
"Requirement already satisfied: numpy>=1.22.4 in /usr/local/lib/python3.10/dist-packages (from pandas->llama-stack-client>=0.0.61->llama-stack) (1.26.4)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->llama-stack-client>=0.0.61->llama-stack) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->llama-stack-client>=0.0.61->llama-stack) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->llama-stack-client>=0.0.61->llama-stack) (2024.2)\n",
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken->llama-models>=0.0.61->llama-stack) (2024.9.11)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->llama-stack-client>=0.0.61->llama-stack) (1.17.0)\n",
"Installing pip dependencies\n",
"Requirement already satisfied: blobfile in /usr/local/lib/python3.10/dist-packages (3.0.0)\n",
"Requirement already satisfied: chardet in /usr/local/lib/python3.10/dist-packages (5.2.0)\n",
"Requirement already satisfied: opentelemetry-sdk in /usr/local/lib/python3.10/dist-packages (1.28.2)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (1.13.1)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (2.2.2)\n",
"Requirement already satisfied: autoevals in /usr/local/lib/python3.10/dist-packages (0.0.109)\n",
"Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (0.2.0)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (1.5.2)\n",
"Requirement already satisfied: pillow in /usr/local/lib/python3.10/dist-packages (10.4.0)\n",
"Requirement already satisfied: pypdf in /usr/local/lib/python3.10/dist-packages (5.1.0)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (4.66.6)\n",
"Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.9.1)\n",
"Requirement already satisfied: aiosqlite in /usr/local/lib/python3.10/dist-packages (0.20.0)\n",
"Requirement already satisfied: psycopg2-binary in /usr/local/lib/python3.10/dist-packages (2.9.10)\n",
"Requirement already satisfied: faiss-cpu in /usr/local/lib/python3.10/dist-packages (1.9.0.post1)\n",
"Requirement already satisfied: opentelemetry-exporter-otlp-proto-http in /usr/local/lib/python3.10/dist-packages (1.28.2)\n",
"Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.46.3)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (1.26.4)\n",
"Requirement already satisfied: chromadb-client in /usr/local/lib/python3.10/dist-packages (0.5.23)\n",
"Requirement already satisfied: openai in /usr/local/lib/python3.10/dist-packages (1.54.5)\n",
"Requirement already satisfied: redis in /usr/local/lib/python3.10/dist-packages (5.2.1)\n",
"Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (3.2.0)\n",
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (3.8.0)\n",
"Requirement already satisfied: together in /usr/local/lib/python3.10/dist-packages (1.3.5)\n",
"Requirement already satisfied: fastapi in /usr/local/lib/python3.10/dist-packages (0.115.6)\n",
"Requirement already satisfied: fire in /usr/local/lib/python3.10/dist-packages (0.7.0)\n",
"Requirement already satisfied: httpx in /usr/local/lib/python3.10/dist-packages (0.28.1)\n",
"Requirement already satisfied: uvicorn in /usr/local/lib/python3.10/dist-packages (0.32.1)\n",
"Requirement already satisfied: pycryptodomex>=3.8 in /usr/local/lib/python3.10/dist-packages (from blobfile) (3.21.0)\n",
"Requirement already satisfied: urllib3<3,>=1.25.3 in /usr/local/lib/python3.10/dist-packages (from blobfile) (2.2.3)\n",
"Requirement already satisfied: lxml>=4.9 in /usr/local/lib/python3.10/dist-packages (from blobfile) (5.3.0)\n",
"Requirement already satisfied: filelock>=3.0 in /usr/local/lib/python3.10/dist-packages (from blobfile) (3.16.1)\n",
"Requirement already satisfied: opentelemetry-api==1.28.2 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-sdk) (1.28.2)\n",
"Requirement already satisfied: opentelemetry-semantic-conventions==0.49b2 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-sdk) (0.49b2)\n",
"Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-sdk) (4.12.2)\n",
"Requirement already satisfied: deprecated>=1.2.6 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-api==1.28.2->opentelemetry-sdk) (1.2.15)\n",
"Requirement already satisfied: importlib-metadata<=8.5.0,>=6.0 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-api==1.28.2->opentelemetry-sdk) (8.5.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas) (2024.2)\n",
"Requirement already satisfied: chevron in /usr/local/lib/python3.10/dist-packages (from autoevals) (0.14.0)\n",
"Requirement already satisfied: levenshtein in /usr/local/lib/python3.10/dist-packages (from autoevals) (0.26.1)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from autoevals) (6.0.2)\n",
"Requirement already satisfied: braintrust_core==0.0.54 in /usr/local/lib/python3.10/dist-packages (from autoevals) (0.0.54)\n",
"Requirement already satisfied: jsonschema in /usr/local/lib/python3.10/dist-packages (from autoevals) (4.23.0)\n",
"Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (1.4.2)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn) (3.5.0)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.7)\n",
"Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2024.9.11)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from faiss-cpu) (24.2)\n",
"Requirement already satisfied: googleapis-common-protos~=1.52 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-exporter-otlp-proto-http) (1.66.0)\n",
"Requirement already satisfied: opentelemetry-exporter-otlp-proto-common==1.28.2 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-exporter-otlp-proto-http) (1.28.2)\n",
"Requirement already satisfied: opentelemetry-proto==1.28.2 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-exporter-otlp-proto-http) (1.28.2)\n",
"Requirement already satisfied: requests~=2.7 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-exporter-otlp-proto-http) (2.32.3)\n",
"Requirement already satisfied: protobuf<6.0,>=5.0 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-proto==1.28.2->opentelemetry-exporter-otlp-proto-http) (5.29.1)\n",
"Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.26.5)\n",
"Requirement already satisfied: tokenizers<0.21,>=0.20 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.20.3)\n",
"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.5)\n",
"Requirement already satisfied: opentelemetry-exporter-otlp-proto-grpc>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from chromadb-client) (1.28.2)\n",
"Requirement already satisfied: overrides>=7.3.1 in /usr/local/lib/python3.10/dist-packages (from chromadb-client) (7.7.0)\n",
"Requirement already satisfied: posthog>=2.4.0 in /usr/local/lib/python3.10/dist-packages (from chromadb-client) (3.7.4)\n",
"Requirement already satisfied: pydantic>=1.9 in /usr/local/lib/python3.10/dist-packages (from chromadb-client) (2.10.3)\n",
"Requirement already satisfied: tenacity>=8.2.3 in /usr/local/lib/python3.10/dist-packages (from chromadb-client) (9.0.0)\n",
"Requirement already satisfied: orjson>=3.9.12 in /usr/local/lib/python3.10/dist-packages (from chromadb-client) (3.10.12)\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai) (3.7.1)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from openai) (1.9.0)\n",
"Requirement already satisfied: jiter<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from openai) (0.8.2)\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai) (1.3.1)\n",
"Requirement already satisfied: async-timeout>=4.0.3 in /usr/local/lib/python3.10/dist-packages (from redis) (4.0.3)\n",
"Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n",
"Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n",
"Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.5.0)\n",
"Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.70.16)\n",
"Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.9.0)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.11.10)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.3.1)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (4.55.2)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (1.4.7)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib) (3.2.0)\n",
"Requirement already satisfied: eval-type-backport<0.3.0,>=0.1.3 in /usr/local/lib/python3.10/dist-packages (from together) (0.2.0)\n",
"Requirement already satisfied: rich<14.0.0,>=13.8.1 in /usr/local/lib/python3.10/dist-packages (from together) (13.9.4)\n",
"Requirement already satisfied: tabulate<0.10.0,>=0.9.0 in /usr/local/lib/python3.10/dist-packages (from together) (0.9.0)\n",
"Requirement already satisfied: typer<0.14,>=0.9 in /usr/local/lib/python3.10/dist-packages (from together) (0.13.1)\n",
"Requirement already satisfied: starlette<0.42.0,>=0.40.0 in /usr/local/lib/python3.10/dist-packages (from fastapi) (0.41.3)\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from fire) (2.5.0)\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx) (2024.8.30)\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx) (1.0.7)\n",
"Requirement already satisfied: idna in /usr/local/lib/python3.10/dist-packages (from httpx) (3.10)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx) (0.14.0)\n",
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.4)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n",
"Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (0.2.1)\n",
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.18.3)\n",
"Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (1.2.2)\n",
"Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.10/dist-packages (from deprecated>=1.2.6->opentelemetry-api==1.28.2->opentelemetry-sdk) (1.17.0)\n",
"Requirement already satisfied: grpcio<2.0.0,>=1.63.2 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb-client) (1.68.1)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog>=2.4.0->chromadb-client) (1.17.0)\n",
"Requirement already satisfied: monotonic>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog>=2.4.0->chromadb-client) (1.6)\n",
"Requirement already satisfied: backoff>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from posthog>=2.4.0->chromadb-client) (2.2.1)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=1.9->chromadb-client) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.27.1 in /usr/local/lib/python3.10/dist-packages (from pydantic>=1.9->chromadb-client) (2.27.1)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests~=2.7->opentelemetry-exporter-otlp-proto-http) (3.4.0)\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich<14.0.0,>=13.8.1->together) (3.0.0)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich<14.0.0,>=13.8.1->together) (2.18.0)\n",
"Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from typer<0.14,>=0.9->together) (1.5.4)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema->autoevals) (2024.10.1)\n",
"Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema->autoevals) (0.35.1)\n",
"Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema->autoevals) (0.22.3)\n",
"Requirement already satisfied: rapidfuzz<4.0.0,>=3.9.0 in /usr/local/lib/python3.10/dist-packages (from levenshtein->autoevals) (3.10.1)\n",
"Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata<=8.5.0,>=6.0->opentelemetry-api==1.28.2->opentelemetry-sdk) (3.21.0)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich<14.0.0,>=13.8.1->together) (0.1.2)\n",
"sentence-transformers --no-deps\n",
"Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.10/dist-packages (3.2.1)\n",
"torch --index-url https://download.pytorch.org/whl/cpu\n",
"Looking in indexes: https://download.pytorch.org/whl/cpu\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.5.1+cu121)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.16.1)\n",
"Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch) (4.12.2)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.4.2)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.4)\n",
"Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch) (2024.9.0)\n",
"Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch) (1.13.1)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch) (1.3.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (3.0.2)\n",
"\u001b[32mBuild Successful!\u001b[0m\n"
]
}
],
"source": [ "source": [
"# NBVAL_SKIP\n", "# NBVAL_SKIP\n",
"!llama stack build --template together --image-type venv --image-name __system__" "!UV_SYSTEM_PYTHON=1 llama stack build --template together --image-type venv"
] ]
}, },
{ {
@ -1027,7 +781,7 @@
" benchmark_id=\"meta-reference::mmmu\",\n", " benchmark_id=\"meta-reference::mmmu\",\n",
" input_rows=eval_rows,\n", " input_rows=eval_rows,\n",
" scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n", " scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
" task_config={\n", " benchmark_config={\n",
" \"type\": \"benchmark\",\n", " \"type\": \"benchmark\",\n",
" \"eval_candidate\": {\n", " \"eval_candidate\": {\n",
" \"type\": \"model\",\n", " \"type\": \"model\",\n",
@ -1072,10 +826,9 @@
"_ = client.datasets.register(\n", "_ = client.datasets.register(\n",
" dataset_id=simpleqa_dataset_id,\n", " dataset_id=simpleqa_dataset_id,\n",
" provider_id=\"huggingface\",\n", " provider_id=\"huggingface\",\n",
" url={\"uri\": \"https://huggingface.co/datasets/llamastack/evals\"},\n", " url={\"uri\": \"https://huggingface.co/datasets/llamastack/simpleqa\"},\n",
" metadata={\n", " metadata={\n",
" \"path\": \"llamastack/evals\",\n", " \"path\": \"llamastack/simpleqa\",\n",
" \"name\": \"evals__simpleqa\",\n",
" \"split\": \"train\",\n", " \"split\": \"train\",\n",
" },\n", " },\n",
" dataset_schema={\n", " dataset_schema={\n",
@ -1206,7 +959,7 @@
" benchmark_id=\"meta-reference::simpleqa\",\n", " benchmark_id=\"meta-reference::simpleqa\",\n",
" input_rows=eval_rows.rows,\n", " input_rows=eval_rows.rows,\n",
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
" task_config={\n", " benchmark_config={\n",
" \"type\": \"benchmark\",\n", " \"type\": \"benchmark\",\n",
" \"eval_candidate\": {\n", " \"eval_candidate\": {\n",
" \"type\": \"model\",\n", " \"type\": \"model\",\n",
@ -1355,7 +1108,7 @@
" benchmark_id=\"meta-reference::simpleqa\",\n", " benchmark_id=\"meta-reference::simpleqa\",\n",
" input_rows=eval_rows.rows,\n", " input_rows=eval_rows.rows,\n",
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n", " scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
" task_config={\n", " benchmark_config={\n",
" \"type\": \"benchmark\",\n", " \"type\": \"benchmark\",\n",
" \"eval_candidate\": {\n", " \"eval_candidate\": {\n",
" \"type\": \"agent\",\n", " \"type\": \"agent\",\n",

File diff suppressed because it is too large Load diff

View file

@ -3,7 +3,7 @@ The RFC Specification (OpenAPI format) is generated from the set of API endpoint
Please install the following packages before running the script: Please install the following packages before running the script:
``` ```
pip install fire PyYAML llama-models pip install fire PyYAML
``` ```
Then simply run `sh run_openapi_generator.sh` Then simply run `sh run_openapi_generator.sh`

View file

@ -55,6 +55,7 @@ def main(output_dir: str):
a set of endpoints and their corresponding interfaces that are tailored to a set of endpoints and their corresponding interfaces that are tailored to
best leverage Llama Models.""", best leverage Llama Models.""",
), ),
include_standard_error_responses=True,
), ),
) )

View file

@ -10,6 +10,7 @@ import typing
from dataclasses import make_dataclass from dataclasses import make_dataclass
from typing import Any, Dict, Set, Union from typing import Any, Dict, Set, Union
from llama_stack.apis.datatypes import Error
from llama_stack.strong_typing.core import JsonType from llama_stack.strong_typing.core import JsonType
from llama_stack.strong_typing.docstring import Docstring, parse_type from llama_stack.strong_typing.docstring import Docstring, parse_type
from llama_stack.strong_typing.inspection import ( from llama_stack.strong_typing.inspection import (
@ -434,6 +435,75 @@ class Generator:
) )
self.schema_builder = SchemaBuilder(schema_generator) self.schema_builder = SchemaBuilder(schema_generator)
self.responses = {} self.responses = {}
# Create standard error responses
self._create_standard_error_responses()
def _create_standard_error_responses(self) -> None:
"""
Creates standard error responses that can be reused across operations.
These will be added to the components.responses section of the OpenAPI document.
"""
# Get the Error schema
error_schema = self.schema_builder.classdef_to_ref(Error)
# Create standard error responses
self.responses["BadRequest400"] = Response(
description="The request was invalid or malformed",
content={
"application/json": MediaType(
schema=error_schema,
example={
"status": 400,
"title": "Bad Request",
"detail": "The request was invalid or malformed",
}
)
}
)
self.responses["TooManyRequests429"] = Response(
description="The client has sent too many requests in a given amount of time",
content={
"application/json": MediaType(
schema=error_schema,
example={
"status": 429,
"title": "Too Many Requests",
"detail": "You have exceeded the rate limit. Please try again later.",
}
)
}
)
self.responses["InternalServerError500"] = Response(
description="The server encountered an unexpected error",
content={
"application/json": MediaType(
schema=error_schema,
example={
"status": 500,
"title": "Internal Server Error",
"detail": "An unexpected error occurred. Our team has been notified.",
}
)
}
)
# Add a default error response for any unhandled error cases
self.responses["DefaultError"] = Response(
description="An unexpected error occurred",
content={
"application/json": MediaType(
schema=error_schema,
example={
"status": 0,
"title": "Error",
"detail": "An unexpected error occurred",
}
)
}
)
def _build_type_tag(self, ref: str, schema: Schema) -> Tag: def _build_type_tag(self, ref: str, schema: Schema) -> Tag:
# Don't include schema definition in the tag description because for one, # Don't include schema definition in the tag description because for one,
@ -649,6 +719,18 @@ class Generator:
responses.update(response_builder.build_response(response_options)) responses.update(response_builder.build_response(response_options))
assert len(responses.keys()) > 0, f"No responses found for {op.name}" assert len(responses.keys()) > 0, f"No responses found for {op.name}"
# Add standard error response references
if self.options.include_standard_error_responses:
if "400" not in responses:
responses["400"] = ResponseRef("BadRequest400")
if "429" not in responses:
responses["429"] = ResponseRef("TooManyRequests429")
if "500" not in responses:
responses["500"] = ResponseRef("InternalServerError500")
if "default" not in responses:
responses["default"] = ResponseRef("DefaultError")
if op.event_type is not None: if op.event_type is not None:
builder = ContentBuilder(self.schema_builder) builder = ContentBuilder(self.schema_builder)
callbacks = { callbacks = {

View file

@ -35,6 +35,7 @@ class Options:
:param error_wrapper: True if errors are encapsulated in an error object wrapper. :param error_wrapper: True if errors are encapsulated in an error object wrapper.
:param property_description_fun: Custom transformation function to apply to class property documentation strings. :param property_description_fun: Custom transformation function to apply to class property documentation strings.
:param captions: User-defined captions for sections such as "Operations" or "Types", and (if applicable) groups of extra types. :param captions: User-defined captions for sections such as "Operations" or "Types", and (if applicable) groups of extra types.
:param include_standard_error_responses: Whether to include standard error responses (400, 429, 500, and a default) in all operations.
""" """
server: Server server: Server
@ -52,6 +53,7 @@ class Options:
error_wrapper: bool = False error_wrapper: bool = False
property_description_fun: Optional[Callable[[type, str, str], str]] = None property_description_fun: Optional[Callable[[type, str, str], str]] = None
captions: Optional[Dict[str, str]] = None captions: Optional[Dict[str, str]] = None
include_standard_error_responses: bool = True
default_captions: ClassVar[Dict[str, str]] = { default_captions: ClassVar[Dict[str, str]] = {
"Operations": "Operations", "Operations": "Operations",

View file

@ -28,6 +28,5 @@ if [ ${#missing_packages[@]} -ne 0 ]; then
fi fi
stack_dir=$(dirname $(dirname $THIS_DIR)) stack_dir=$(dirname $(dirname $THIS_DIR))
models_dir=$(dirname $stack_dir)/llama-models PYTHONPATH=$PYTHONPATH:$stack_dir \
PYTHONPATH=$PYTHONPATH:$stack_dir:$models_dir \
python -m docs.openapi_generator.generate $(dirname $THIS_DIR)/_static python -m docs.openapi_generator.generate $(dirname $THIS_DIR)/_static

View file

@ -11,3 +11,4 @@ sphinxcontrib-openapi
sphinxcontrib-redoc sphinxcontrib-redoc
sphinxcontrib-mermaid sphinxcontrib-mermaid
sphinxcontrib-video sphinxcontrib-video
tomli

View file

@ -0,0 +1,89 @@
# Llama Stack Agent Framework
The Llama Stack agent framework is built on a modular architecture that allows for flexible and powerful AI applications. This document explains the key components and how they work together.
## Core Concepts
### 1. Agent Configuration
Agents are configured using the `AgentConfig` class, which includes:
- **Model**: The underlying LLM to power the agent
- **Instructions**: System prompt that defines the agent's behavior
- **Tools**: Capabilities the agent can use to interact with external systems
- **Safety Shields**: Guardrails to ensure responsible AI behavior
```python
from llama_stack_client.lib.agents.agent import Agent
# Create the agent
agent = Agent(
llama_stack_client,
model="meta-llama/Llama-3-70b-chat",
instructions="You are a helpful assistant that can use tools to answer questions.",
tools=["builtin::code_interpreter", "builtin::rag/knowledge_search"],
)
```
### 2. Sessions
Agents maintain state through sessions, which represent a conversation thread:
```python
# Create a session
session_id = agent.create_session(session_name="My conversation")
```
### 3. Turns
Each interaction with an agent is called a "turn" and consists of:
- **Input Messages**: What the user sends to the agent
- **Steps**: The agent's internal processing (inference, tool execution, etc.)
- **Output Message**: The agent's response
```python
from llama_stack_client.lib.agents.event_logger import EventLogger
# Create a turn with streaming response
turn_response = agent.create_turn(
session_id=session_id,
messages=[{"role": "user", "content": "Tell me about Llama models"}],
)
for log in EventLogger().log(turn_response):
log.print()
```
### Non-Streaming
```python
from rich.pretty import pprint
# Non-streaming API
response = agent.create_turn(
session_id=session_id,
messages=[{"role": "user", "content": "Tell me about Llama models"}],
stream=False,
)
print("Inputs:")
pprint(response.input_messages)
print("Output:")
pprint(response.output_message.content)
print("Steps:")
pprint(response.steps)
```
### 4. Steps
Each turn consists of multiple steps that represent the agent's thought process:
- **Inference Steps**: The agent generating text responses
- **Tool Execution Steps**: The agent using tools to gather information
- **Shield Call Steps**: Safety checks being performed
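As a rough illustration, you can branch on the step type when walking the steps of a non-streaming turn. This is a minimal sketch that reuses the `response` object from the non-streaming example above; `step_type` and `tool_calls` mirror their usage elsewhere in these docs, while other per-step fields vary by step type and client version, so inspect them (e.g. with `pprint`) before relying on them.

```python
# Summarize the steps of a non-streaming turn (reuses `response` from above).
for step in response.steps:
    if step.step_type == "inference":
        print("inference step")
    elif step.step_type == "tool_execution":
        # `tool_calls` follows the usage shown in the evaluation examples.
        print("tool execution step:", step.tool_calls[0].tool_name)
    elif step.step_type == "shield_call":
        print("shield call step")
```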
## Agent Execution Loop
Refer to the [Agent Execution Loop](agent_execution_loop) for more details on what happens within an agent turn.

View file

@ -7,13 +7,13 @@ Each agent turn follows these key steps:
1. **Initial Safety Check**: The user's input is first screened through configured safety shields 1. **Initial Safety Check**: The user's input is first screened through configured safety shields
2. **Context Retrieval**: 2. **Context Retrieval**:
- If RAG is enabled, the agent queries relevant documents from memory banks - If RAG is enabled, the agent can choose to query relevant documents from memory banks. You can use the `instructions` field to steer the agent.
- For new documents, they are first inserted into the memory bank - For new documents, they are first inserted into the memory bank.
- Retrieved context is augmented to the user's prompt - Retrieved context is provided to the LLM as a tool response in the message history.
3. **Inference Loop**: The agent enters its main execution loop: 3. **Inference Loop**: The agent enters its main execution loop:
- The LLM receives the augmented prompt (with context and/or previous tool outputs) - The LLM receives a user prompt (with previous tool outputs)
- The LLM generates a response, potentially with tool calls - The LLM generates a response, potentially with [tool calls](tools)
- If tool calls are present: - If tool calls are present:
- Tool inputs are safety-checked - Tool inputs are safety-checked
- Tools are executed (e.g., web search, code execution) - Tools are executed (e.g., web search, code execution)
@ -40,19 +40,16 @@ sequenceDiagram
S->>E: Input Safety Check S->>E: Input Safety Check
deactivate S deactivate S
E->>M: 2.1 Query Context
M-->>E: 2.2 Retrieved Documents
loop Inference Loop loop Inference Loop
E->>L: 3.1 Augment with Context E->>L: 2.1 Augment with Context
L-->>E: 3.2 Response (with/without tool calls) L-->>E: 2.2 Response (with/without tool calls)
alt Has Tool Calls alt Has Tool Calls
E->>S: Check Tool Input E->>S: Check Tool Input
S->>T: 4.1 Execute Tool S->>T: 3.1 Execute Tool
T-->>E: 4.2 Tool Response T-->>E: 3.2 Tool Response
E->>L: 5.1 Tool Response E->>L: 4.1 Tool Response
L-->>E: 5.2 Synthesized Response L-->>E: 4.2 Synthesized Response
end end
opt Stop Conditions opt Stop Conditions
@ -64,23 +61,34 @@ sequenceDiagram
end end
E->>S: Output Safety Check E->>S: Output Safety Check
S->>U: 6. Final Response S->>U: 5. Final Response
``` ```
Each step in this process can be monitored and controlled through configurations. Here's an example that demonstrates monitoring the agent's execution: Each step in this process can be monitored and controlled through configurations. Here's an example that demonstrates monitoring the agent's execution:
```python ```python
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger from llama_stack_client.lib.agents.event_logger import EventLogger
from rich.pretty import pprint
agent_config = AgentConfig( # Replace host and port
client = LlamaStackClient(base_url=f"http://{HOST}:{PORT}")
agent = Agent(
client,
# Check with `llama-stack-client models list`
model="Llama3.2-3B-Instruct", model="Llama3.2-3B-Instruct",
instructions="You are a helpful assistant", instructions="You are a helpful assistant",
# Enable both RAG and tool usage # Enable both RAG and tool usage
toolgroups=[ tools=[
{"name": "builtin::rag", "args": {"vector_db_ids": ["my_docs"]}}, {
"name": "builtin::rag/knowledge_search",
"args": {"vector_db_ids": ["my_docs"]},
},
"builtin::code_interpreter", "builtin::code_interpreter",
], ],
# Configure safety # Configure safety (optional)
input_shields=["llama_guard"], input_shields=["llama_guard"],
output_shields=["llama_guard"], output_shields=["llama_guard"],
# Control the inference loop # Control the inference loop
@ -90,14 +98,12 @@ agent_config = AgentConfig(
"max_tokens": 2048, "max_tokens": 2048,
}, },
) )
agent = Agent(client, agent_config)
session_id = agent.create_session("monitored_session") session_id = agent.create_session("monitored_session")
# Stream the agent's execution steps # Stream the agent's execution steps
response = agent.create_turn( response = agent.create_turn(
messages=[{"role": "user", "content": "Analyze this code and run it"}], messages=[{"role": "user", "content": "Analyze this code and run it"}],
attachments=[ documents=[
{ {
"content": "https://raw.githubusercontent.com/example/code.py", "content": "https://raw.githubusercontent.com/example/code.py",
"mime_type": "text/plain", "mime_type": "text/plain",
@ -108,14 +114,21 @@ response = agent.create_turn(
# Monitor each step of execution # Monitor each step of execution
for log in EventLogger().log(response): for log in EventLogger().log(response):
if log.event.step_type == "memory_retrieval": log.print()
print("Retrieved context:", log.event.retrieved_context)
elif log.event.step_type == "inference": # Using non-streaming API, the response contains input, steps, and output.
print("LLM output:", log.event.model_response) response = agent.create_turn(
elif log.event.step_type == "tool_execution": messages=[{"role": "user", "content": "Analyze this code and run it"}],
print("Tool call:", log.event.tool_call) documents=[
print("Tool response:", log.event.tool_response) {
elif log.event.step_type == "shield_call": "content": "https://raw.githubusercontent.com/example/code.py",
if log.event.violation: "mime_type": "text/plain",
print("Safety violation:", log.event.violation) }
],
session_id=session_id,
)
pprint(f"Input: {response.input_messages}")
pprint(f"Output: {response.output_message.content}")
pprint(f"Steps: {response.steps}")
``` ```

View file

@ -1,170 +1,124 @@
# Evals # Evaluations
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing)        Llama Stack provides a set of APIs for running evaluations of LLM applications.
- `/datasetio` + `/datasets` API
- `/scoring` + `/scoring_functions` API
- `/eval` + `/benchmarks` API
Llama Stack provides the building blocks needed to run benchmark and application evaluations. This guide will walk you through how to use these components to run open benchmark evaluations. Visit our [Evaluation Concepts](../concepts/evaluation_concepts.md) guide for more details on how evaluations work in Llama Stack, and our [Evaluation Reference](../references/evals_reference/index.md) guide for a comprehensive reference on the APIs.
### 1. Open Benchmark Model Evaluation
This first example walks you through how to evaluate a model candidate served by Llama Stack on open benchmarks. We will use the following benchmark:        This guide walks you through the process of evaluating an LLM application built using Llama Stack. The [Evaluation Reference](../references/evals_reference/index.md) guide goes over the set of APIs and the developer experience of using Llama Stack to run evaluations for benchmark and application use cases. Check out our Colab notebook with working evaluation examples [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing).
- [MMMU](https://arxiv.org/abs/2311.16502) (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI): Benchmark designed to evaluate multimodal models.
- [SimpleQA](https://openai.com/index/introducing-simpleqa/): Benchmark designed to access models to answer short, fact-seeking questions.
#### 1.1 Running MMMU
- We will use a pre-processed MMMU dataset from [llamastack/mmmu](https://huggingface.co/datasets/llamastack/mmmu). The preprocessing code is shown in in this [Github Gist](https://gist.github.com/yanxi0830/118e9c560227d27132a7fd10e2c92840). The dataset is obtained by transforming the original [MMMU/MMMU](https://huggingface.co/datasets/MMMU/MMMU) dataset into correct format by `inference/chat-completion` API.
## Application Evaluation
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)
Llama Stack offers a library of scoring functions and the `/scoring` API, allowing you to run evaluations on your pre-annotated AI application datasets.
In this example, we will show you how to:
1. Build an Agent with Llama Stack
2. Query the agent's sessions, turns, and steps
3. Evaluate the results.
##### Building a Search Agent
```python ```python
import datasets from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger
ds = datasets.load_dataset(path="llamastack/mmmu", name="Agriculture", split="dev") agent = Agent(
ds = ds.select_columns(["chat_completion_input", "input_query", "expected_answer"]) client,
eval_rows = ds.to_pandas().to_dict(orient="records") model="meta-llama/Llama-3.3-70B-Instruct",
``` instructions="You are a helpful assistant. Use search tool to answer the questions. ",
tools=["builtin::websearch"],
- Next, we will run evaluation on an model candidate, we will need to:
- Define a system prompt
- Define an EvalCandidate
- Run evaluate on the dataset
```python
SYSTEM_PROMPT_TEMPLATE = """
You are an expert in Agriculture whose job is to answer questions from the user using images.
First, reason about the correct answer.
Then write the answer in the following format where X is exactly one of A,B,C,D:
Answer: X
Make sure X is one of A,B,C,D.
If you are uncertain of the correct answer, guess the most likely one.
"""
system_message = {
"role": "system",
"content": SYSTEM_PROMPT_TEMPLATE,
}
client.benchmarks.register(
benchmark_id="meta-reference::mmmu",
dataset_id=f"mmmu-{subset}-{split}",
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
) )
user_prompts = [
"Which teams played in the NBA western conference finals of 2024. Search the web for the answer.",
"In which episode and season of South Park does Bill Cosby (BSM-471) first appear? Give me the number and title. Search the web for the answer.",
"What is the British-American kickboxer Andrew Tate's kickboxing name? Search the web for the answer.",
]
response = client.eval.evaluate_rows( session_id = agent.create_session("test-session")
benchmark_id="meta-reference::mmmu",
input_rows=eval_rows,
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
task_config={
"type": "benchmark",
"eval_candidate": {
"type": "model",
"model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
"sampling_params": {
"strategy": {
"type": "greedy",
},
"max_tokens": 4096,
"repeat_penalty": 1.0,
},
"system_message": system_message,
},
},
)
```
#### 1.2. Running SimpleQA for prompt in user_prompts:
- We will use a pre-processed SimpleQA dataset from [llamastack/evals](https://huggingface.co/datasets/llamastack/evals/viewer/evals__simpleqa) which is obtained by transforming the input query into correct format accepted by `inference/chat-completion` API. response = agent.create_turn(
- Since we will be using this same dataset in our next example for Agentic evaluation, we will register it using the `/datasets` API, and interact with it through `/datasetio` API. messages=[
{
"role": "user",
"content": prompt,
}
],
session_id=session_id,
)
```python for log in EventLogger().log(response):
simpleqa_dataset_id = "huggingface::simpleqa" log.print()
_ = client.datasets.register(
dataset_id=simpleqa_dataset_id,
provider_id="huggingface",
url={"uri": "https://huggingface.co/datasets/llamastack/evals"},
metadata={
"path": "llamastack/evals",
"name": "evals__simpleqa",
"split": "train",
},
dataset_schema={
"input_query": {"type": "string"},
"expected_answer": {"type": "string"},
"chat_completion_input": {"type": "chat_completion_input"},
},
)
eval_rows = client.datasetio.get_rows_paginated(
dataset_id=simpleqa_dataset_id,
rows_in_page=5,
)
```
```python
client.benchmarks.register(
benchmark_id="meta-reference::simpleqa",
dataset_id=simpleqa_dataset_id,
scoring_functions=["llm-as-judge::405b-simpleqa"],
)
response = client.eval.evaluate_rows(
benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={
"type": "benchmark",
"eval_candidate": {
"type": "model",
"model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
"sampling_params": {
"strategy": {
"type": "greedy",
},
"max_tokens": 4096,
"repeat_penalty": 1.0,
},
},
},
)
``` ```
### 2. Agentic Evaluation ##### Query Agent Execution Steps
- In this example, we will demonstrate how to evaluate a agent candidate served by Llama Stack via `/agent` API.
- We will continue to use the SimpleQA dataset we used in previous example.        Now, let's look deeper into the agent's execution steps and see how well our agent performs.
- Instead of running evaluation on model, we will run the evaluation on a Search Agent with access to search tool. We will define our agent evaluation candidate through `AgentConfig`. ```python
# query the agents session
from rich.pretty import pprint
session_response = client.agents.session.retrieve(
session_id=session_id,
agent_id=agent.agent_id,
)
pprint(session_response)
```
As a sanity check, we will first check if all user prompts are followed by a tool call to `brave_search`.
```python
num_tool_call = 0
for turn in session_response.turns:
for step in turn.steps:
if (
step.step_type == "tool_execution"
and step.tool_calls[0].tool_name == "brave_search"
):
num_tool_call += 1
print(
f"{num_tool_call}/{len(session_response.turns)} user prompts are followed by a tool call to `brave_search`"
)
```
##### Evaluate Agent Responses
Now, we want to evaluate the agent's responses to the user prompts.
1. First, we will process the agent's execution history into a list of rows that can be used for evaluation.
2. Next, we will label the rows with the expected answer.
3. Finally, we will use the `/scoring` API to score the agent's responses.
```python ```python
agent_config = { eval_rows = []
"model": "meta-llama/Llama-3.1-405B-Instruct",
"instructions": "You are a helpful assistant", expected_answers = [
"sampling_params": { "Dallas Mavericks and the Minnesota Timberwolves",
"strategy": { "Season 4, Episode 12",
"type": "greedy", "King Cobra",
}, ]
},
"tools": [ for i, turn in enumerate(session_response.turns):
eval_rows.append(
{ {
"type": "brave_search", "input_query": turn.input_messages[0].content,
"engine": "tavily", "generated_answer": turn.output_message.content,
"api_key": userdata.get("TAVILY_SEARCH_API_KEY"), "expected_answer": expected_answers[i],
} }
], )
"tool_choice": "auto",
"tool_prompt_format": "json",
"input_shields": [],
"output_shields": [],
"enable_session_persistence": False,
}
response = client.eval.evaluate_rows( pprint(eval_rows)
benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows, scoring_params = {
scoring_functions=["llm-as-judge::405b-simpleqa"], "basic::subset_of": None,
task_config={ }
"type": "benchmark", scoring_response = client.scoring.score(
"eval_candidate": { input_rows=eval_rows, scoring_functions=scoring_params
"type": "agent",
"config": agent_config,
},
},
) )
pprint(scoring_response)
``` ```

View file

@ -1,30 +0,0 @@
## Testing & Evaluation
Llama Stack provides built-in tools for evaluating your applications:
1. **Benchmarking**: Test against standard datasets
2. **Application Evaluation**: Score your application's outputs
3. **Custom Metrics**: Define your own evaluation criteria
Here's how to set up basic evaluation:
```python
# Create an evaluation task
response = client.benchmarks.register(
benchmark_id="my_eval",
dataset_id="my_dataset",
scoring_functions=["accuracy", "relevance"],
)
# Run evaluation
job = client.eval.run_eval(
benchmark_id="my_eval",
task_config={
"type": "app",
"eval_candidate": {"type": "agent", "config": agent_config},
},
)
# Get results
result = client.eval.job_result(benchmark_id="my_eval", job_id=job.job_id)
```

View file

@ -8,22 +8,24 @@ The best way to get started is to look at this notebook which walks through the
Here are some key topics that will help you build effective agents: Here are some key topics that will help you build effective agents:
- **[Agent Execution Loop](agent_execution_loop)** - **[Agent](agent)**: Understand the components and design patterns of the Llama Stack agent framework.
- **[RAG](rag)** - **[Agent Execution Loop](agent_execution_loop)**: Understand how agents process information, make decisions, and execute actions in a continuous loop.
- **[Safety](safety)** - **[RAG (Retrieval-Augmented Generation)](rag)**: Learn how to enhance your agents with external knowledge through retrieval mechanisms.
- **[Tools](tools)** - **[Tools](tools)**: Extend your agents' capabilities by integrating with external tools and APIs.
- **[Telemetry](telemetry)** - **[Evals](evals)**: Evaluate your agents' effectiveness and identify areas for improvement.
- **[Evals](evals)** - **[Telemetry](telemetry)**: Monitor and analyze your agents' performance and behavior.
- **[Safety](safety)**: Implement guardrails and safety measures to ensure responsible AI behavior.
```{toctree} ```{toctree}
:hidden: :hidden:
:maxdepth: 1 :maxdepth: 1
agent
agent_execution_loop agent_execution_loop
rag rag
safety
tools tools
telemetry telemetry
evals evals
advanced_agent_patterns
safety
``` ```

View file

@ -1,8 +1,8 @@
## Using "Memory" or Retrieval Augmented Generation (RAG) ## Using Retrieval Augmented Generation (RAG)
Memory enables your applications to reference and recall information from previous interactions or external documents. RAG enables your applications to reference and recall information from previous interactions or external documents.
Llama Stack organizes the memory APIs into three layers: Llama Stack organizes the APIs that enable RAG into three layers:
- the lowermost APIs deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon) and Relational IO (also coming soon.) - the lowermost APIs deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon) and Relational IO (also coming soon.)
- next is the "Rag Tool", a first-class tool as part of the Tools API that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly. - next is the "Rag Tool", a first-class tool as part of the Tools API that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly.
- finally, it all comes together with the top-level "Agents" API that allows you to create agents that can use the tools to answer questions, perform tasks, and more. - finally, it all comes together with the top-level "Agents" API that allows you to create agents that can use the tools to answer questions, perform tasks, and more.
@ -20,6 +20,11 @@ We may add more storage types like Graph IO in the future.
Here's how to set up a vector database for RAG: Here's how to set up a vector database for RAG:
```python ```python
# Create http client
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
# Register a vector db # Register a vector db
vector_db_id = "my_documents" vector_db_id = "my_documents"
response = client.vector_dbs.register( response = client.vector_dbs.register(
@ -81,27 +86,37 @@ results = client.tool_runtime.rag_tool.query(
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example: One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
```python ```python
from llama_stack_client.types.agent_create_params import AgentConfig
from llama_stack_client.lib.agents.agent import Agent from llama_stack_client.lib.agents.agent import Agent
# Configure agent with memory # Create agent with memory
agent_config = AgentConfig( agent = Agent(
model="meta-llama/Llama-3.2-3B-Instruct", client,
model="meta-llama/Llama-3.3-70B-Instruct",
instructions="You are a helpful assistant", instructions="You are a helpful assistant",
enable_session_persistence=False, tools=[
toolgroups=[
{ {
"name": "builtin::rag", "name": "builtin::rag/knowledge_search",
"args": { "args": {
"vector_db_ids": [vector_db_id], "vector_db_ids": [vector_db_id],
}, },
} }
], ],
) )
agent = Agent(client, agent_config)
session_id = agent.create_session("rag_session") session_id = agent.create_session("rag_session")
# Ask questions about documents in the vector db, and the agent will query the db to answer the question.
response = agent.create_turn(
messages=[{"role": "user", "content": "How to optimize memory in PyTorch?"}],
session_id=session_id,
)
```
> **NOTE:** the `instructions` field in the `AgentConfig` can be used to guide the agent's behavior. It is important to experiment with different instructions to see what works best for your use case.
You can also pass documents along with the user's message and ask questions about them.
```python
# Initial document ingestion # Initial document ingestion
response = agent.create_turn( response = agent.create_turn(
messages=[ messages=[
@ -109,7 +124,7 @@ response = agent.create_turn(
], ],
documents=[ documents=[
{ {
"content": "https://raw.githubusercontent.com/example/doc.rst", "content": "https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/memory_optimizations.rst",
"mime_type": "text/plain", "mime_type": "text/plain",
} }
], ],
@ -123,6 +138,14 @@ response = agent.create_turn(
) )
``` ```
You can print the response with the following:
```python
from llama_stack_client.lib.agents.event_logger import EventLogger
for log in EventLogger().log(response):
log.print()
```
### Unregistering Vector DBs ### Unregistering Vector DBs
If you need to clean up and unregister vector databases, you can do so as follows: If you need to clean up and unregister vector databases, you can do so as follows:
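A minimal sketch, assuming the client SDK exposes an `unregister` call that mirrors the `register` call shown earlier (check your client version for the exact signature):

```python
# Remove the vector db registered earlier in this guide.
client.vector_dbs.unregister(vector_db_id=vector_db_id)
```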

View file

@ -5,7 +5,7 @@ An example of this would be a "db_access" tool group that contains tools for int
Tools are treated as any other resource in llama stack like models. You can register them, have providers for them etc. Tools are treated as any other resource in llama stack like models. You can register them, have providers for them etc.
When instatiating an agent, you can provide it a list of tool groups that it has access to. Agent gets the corresponding tool definitions for the specified tool groups and passes them along to the model. When instantiating an agent, you can provide it a list of tool groups that it has access to. Agent gets the corresponding tool definitions for the specified tool groups and passes them along to the model.
Refer to the [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) notebook for more examples on how to use tools. Refer to the [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) notebook for more examples on how to use tools.
@ -60,7 +60,7 @@ Features:
- Disabled dangerous system operations - Disabled dangerous system operations
- Configurable execution timeouts - Configurable execution timeouts
> ⚠️ Important: The code interpreter tool can operate in a controlled enviroment locally or on Podman containers. To ensure proper functionality in containerised environments: > ⚠️ Important: The code interpreter tool can operate in a controlled environment locally or on Podman containers. To ensure proper functionality in containerized environments:
> - The container requires privileged access (e.g., --privileged). > - The container requires privileged access (e.g., --privileged).
> - Users without sufficient permissions may encounter permission errors. (`bwrap: Can't mount devpts on /newroot/dev/pts: Permission denied`) > - Users without sufficient permissions may encounter permission errors. (`bwrap: Can't mount devpts on /newroot/dev/pts: Permission denied`)
> - 🔒 Security Warning: Privileged mode grants elevated access and bypasses security restrictions. Use only in local, isolated, or controlled environments. > - 🔒 Security Warning: Privileged mode grants elevated access and bypasses security restrictions. Use only in local, isolated, or controlled environments.
@ -83,15 +83,15 @@ result = client.tool_runtime.invoke_tool(
) )
``` ```
#### Memory #### RAG
The Memory tool enables retrieval of context from various types of memory banks (vector, key-value, keyword, and graph). The RAG tool enables retrieval of context from various types of memory banks (vector, key-value, keyword, and graph).
```python ```python
# Register Memory tool group # Register Memory tool group
client.toolgroups.register( client.toolgroups.register(
toolgroup_id="builtin::memory", toolgroup_id="builtin::rag",
provider_id="memory", provider_id="faiss",
args={"max_chunks": 5, "max_tokens_in_context": 4096}, args={"max_chunks": 5, "max_tokens_in_context": 4096},
) )
``` ```
@ -102,7 +102,7 @@ Features:
- Context retrieval with token limits - Context retrieval with token limits
> **Note:** By default, llama stack run.yaml defines toolgroups for web search, code interpreter and memory, that are provided by tavily-search, code-interpreter and memory providers.        > **Note:** By default, llama stack run.yaml defines toolgroups for web search, code interpreter and rag, which are provided by the tavily-search, code-interpreter and rag providers.
## Model Context Protocol (MCP) Tools ## Model Context Protocol (MCP) Tools
@ -125,50 +125,31 @@ MCP tools require:
- Tools are discovered dynamically from the endpoint - Tools are discovered dynamically from the endpoint
## Tools provided by the client ## Adding Custom Tools
These tools are registered along with the agent config and are specific to the agent for which they are registered. The main difference between these tools and the tools provided by the built-in providers is that the execution of these tools is handled by the client and the agent transfers the tool call to the client and waits for the result from the client. When you want to use tools other than the built-in tools, you just need to implement a python function with a docstring. The content of the docstring will be used to describe the tool and the parameters and passed
along to the generative model.
```python
# Example tool definition
def my_tool(input: int) -> int:
"""
Runs my awesome tool.
:param input: some int parameter
"""
return input * 2
```
> **NOTE:** We employ python docstrings to describe the tool and the parameters. It is important to document the tool and the parameters so that the model can use the tool correctly. It is recommended to experiment with different docstrings to see how they affect the model's behavior.
Once defined, simply pass the tool to the agent config. `Agent` will take care of the rest (calling the model with the tool definition, executing the tool, and returning the result to the model for the next iteration).
```python ```python
# Example agent config with client provided tools # Example agent config with client provided tools
config = AgentConfig( agent = Agent(client, ..., tools=[my_tool])
toolgroups=[
"builtin::websearch",
],
client_tools=[ToolDef(name="client_tool", description="Client provided tool")],
)
``` ```
Refer to [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/blob/main/examples/agents/e2e_loop_with_client_tools.py) for an example of how to use client provided tools. Refer to [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/blob/main/examples/agents/e2e_loop_with_client_tools.py) for an example of how to use client provided tools.
## Tool Structure
Each tool has the following components:
- `name`: Unique identifier for the tool
- `description`: Human-readable description of the tool's functionality
- `parameters`: List of parameters the tool accepts
- `name`: Parameter name
- `parameter_type`: Data type (string, number, etc.)
- `description`: Parameter description
- `required`: Whether the parameter is required (default: true)
- `default`: Default value if any
Example tool definition:
```python
{
"name": "web_search",
"description": "Search the web for information",
"parameters": [
{
"name": "query",
"parameter_type": "string",
"description": "The query to search for",
"required": True,
}
],
}
```
## Tool Invocation ## Tool Invocation
@ -201,10 +182,10 @@ group_tools = client.tools.list_tools(toolgroup_id="search_tools")
```python ```python
from llama_stack_client.lib.agents.agent import Agent from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.types.agent_create_params import AgentConfig
# Configure the AI agent with necessary parameters # Instantiate the AI agent with the given configuration
agent_config = AgentConfig( agent = Agent(
client,
name="code-interpreter", name="code-interpreter",
description="A code interpreter agent for executing Python code snippets", description="A code interpreter agent for executing Python code snippets",
instructions=""" instructions="""
@ -212,14 +193,10 @@ agent_config = AgentConfig(
Always show the generated code, never generate your own code, and never anticipate results. Always show the generated code, never generate your own code, and never anticipate results.
""", """,
model="meta-llama/Llama-3.2-3B-Instruct", model="meta-llama/Llama-3.2-3B-Instruct",
toolgroups=["builtin::code_interpreter"], tools=["builtin::code_interpreter"],
max_infer_iters=5, max_infer_iters=5,
enable_session_persistence=False,
) )
# Instantiate the AI agent with the given configuration
agent = Agent(client, agent_config)
# Start a session # Start a session
session_id = agent.create_session("tool_session") session_id = agent.create_session("tool_session")

View file

@ -24,17 +24,58 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
- Associated with `Benchmark` resource. - Associated with `Benchmark` resource.
Use the following decision tree to decide how to use LlamaStack Evaluation flow. ## Open-benchmark Eval
![Eval Flow](../references/evals_reference/resources/eval-flow.png)
### List of open-benchmarks Llama Stack supports
Llama stack pre-registers several popular open-benchmarks to easily evaluate model performance via the CLI.
The list of open-benchmarks we currently support:
- [MMLU-COT](https://arxiv.org/abs/2009.03300) (Measuring Massive Multitask Language Understanding): Benchmark designed to comprehensively evaluate the breadth and depth of a model's academic and professional understanding
- [GPQA-COT](https://arxiv.org/abs/2311.12022) (A Graduate-Level Google-Proof Q&A Benchmark): A challenging benchmark of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry.
- [SimpleQA](https://openai.com/index/introducing-simpleqa/): Benchmark designed to assess a model's ability to answer short, fact-seeking questions.
- [MMMU](https://arxiv.org/abs/2311.16502) (A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI): Benchmark designed to evaluate multimodal models.
```{admonition} Note on Benchmark v.s. Application Evaluation You can follow this [contributing guide](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) to add more open-benchmarks to Llama Stack
:class: tip
- **Benchmark Evaluation** is a well-defined eval-task consisting of `dataset` and `scoring_function`. The generation (inference or agent) will be done as part of evaluation. ### Run evaluation on open-benchmarks via CLI
- **Application Evaluation** assumes users already have app inputs & generated outputs. Evaluation will purely focus on scoring the generated outputs via scoring functions (e.g. LLM-as-judge).
We have built-in functionality to run the supported open-benchmarks using the llama-stack-client CLI.
#### Spin up Llama Stack server
Spin up llama stack server with 'open-benchmark' template
``` ```
llama stack run llama_stack/templates/open-benchmark/run.yaml
```
#### Run eval CLI
There are 3 necessary inputs to run a benchmark eval:
- `list of benchmark_ids`: The list of benchmark ids to run evaluation on
- `model-id`: The model id to evaluate on
- `output_dir`: Path to store the evaluation results
```
llama-stack-client eval run-benchmark <benchmark_id_1> <benchmark_id_2> ... \
--model_id <model id to evaluate on> \
--output_dir <directory to store the evaluation results>
```
You can run
```
llama-stack-client eval run-benchmark help
```
to see the descriptions of all the flags that `eval run-benchmark` supports.
In the output log, you can find the file path that contains your evaluation results. Open that file to see your aggregate evaluation results.
## What's Next? ## What's Next?
- Check out our Colab notebook on working examples with evaluations [here](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing). - Check out our Colab notebook on working examples with running benchmark evaluations [here](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb#scrollTo=mxLCsP4MvFqP).
- Check out our [Building Applications - Evaluation](../building_applications/evals.md) guide for more details on how to use the Evaluation APIs to evaluate your applications.
- Check out our [Evaluation Reference](../references/evals_reference/index.md) for more details on the APIs. - Check out our [Evaluation Reference](../references/evals_reference/index.md) for more details on the APIs.

View file

@ -1,5 +1,13 @@
# Core Concepts # Core Concepts
```{toctree}
:maxdepth: 1
:hidden:
evaluation_concepts
```
Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks. Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks.
@ -26,7 +34,7 @@ We are working on adding a few more APIs to complete the application lifecycle.
The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include: The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.), - LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.),
- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, FAISS, PGVector, etc.), - Vector databases (e.g., ChromaDB, Weaviate, Qdrant, Milvus, FAISS, PGVector, etc.),
- Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.) - Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.)
Providers come in two flavors: Providers come in two flavors:

View file

@ -13,6 +13,19 @@
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
from docutils import nodes from docutils import nodes
from pathlib import Path
import requests
import json
# Read version from pyproject.toml
with Path(__file__).parent.parent.parent.joinpath("pyproject.toml").open("rb") as f:
pypi_url = "https://pypi.org/pypi/llama-stack/json"
version_tag = json.loads(requests.get(pypi_url).text)["info"]["version"]
print(f"{version_tag=}")
# generate the full link including text and url here
llama_stack_version_url = f"https://github.com/meta-llama/llama-stack/releases/tag/v{version_tag}"
llama_stack_version_link = f"<a href='{llama_stack_version_url}'>release notes</a>"
project = "llama-stack" project = "llama-stack"
copyright = "2025, Meta" copyright = "2025, Meta"
@ -66,6 +79,8 @@ myst_enable_extensions = [
myst_substitutions = { myst_substitutions = {
"docker_hub": "https://hub.docker.com/repository/docker/llamastack", "docker_hub": "https://hub.docker.com/repository/docker/llamastack",
"llama_stack_version": version_tag,
"llama_stack_version_link": llama_stack_version_link,
} }
suppress_warnings = ['myst.header'] suppress_warnings = ['myst.header']

View file

@ -17,25 +17,31 @@ Here are some example PRs to help you get started:
## Testing the Provider ## Testing the Provider
Before running tests, you must have the required dependencies installed. This depends on the providers or distributions you are testing. For example, if you are testing the `together` distribution, you should install dependencies via `llama stack build --template together`.
### 1. Integration Testing ### 1. Integration Testing
- Create integration tests that use real provider instances and configurations
- For remote services, test actual API interactions
- Avoid mocking at the provider level since adapter layers tend to be thin
- Reference examples in {repopath}`tests/client-sdk`
### 2. Unit Testing (Optional) Integration tests are located in {repopath}`tests/integration`. These tests use the python client-SDK APIs (from the `llama_stack_client` package) to test functionality. Since these tests use client APIs, they can be run either by pointing to an instance of the Llama Stack server or "inline" by using `LlamaStackAsLibraryClient`.
- Add unit tests for provider-specific functionality
- See examples in {repopath}`llama_stack/providers/tests/inference/test_text_inference.py` Consult {repopath}`tests/integration/README.md` for more details on how to run the tests.
Note that each provider's `sample_run_config()` method (in the configuration class for that provider)
typically references some environment variables for specifying API keys and the like. You can set these in the environment or pass these via the `--env` flag to the test command.
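A hypothetical invocation sketch, assuming the `together` distribution and that the test suite's conftest provides the `--env` option; consult {repopath}`tests/integration/README.md` for the exact flags:

```
# Install dependencies for the provider under test, then run the integration suite.
llama stack build --template together --image-type venv
pytest -s -v tests/integration --env TOGETHER_API_KEY=<your key>
```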
### 2. Unit Testing
Unit tests are located in {repopath}`tests/unit`. Provider-specific unit tests are located in {repopath}`tests/unit/providers`. These tests are all run automatically as part of the CI process.
### 3. Additional end-to-end testing
### 3. End-to-End Testing
1. Start a Llama Stack server with your new provider 1. Start a Llama Stack server with your new provider
2. Test using client requests 2. Verify compatibility with existing client scripts in the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main) repository
3. Verify compatibility with existing client scripts in the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main) repository 3. Document which scripts are compatible with your provider
4. Document which scripts are compatible with your provider
## Submitting Your PR ## Submitting Your PR
1. Ensure all tests pass 1. Ensure all tests pass
2. Include a comprehensive test plan in your PR summary 2. Include a comprehensive test plan in your PR summary
3. Document any known limitations or considerations 3. Document any known limitations or considerations
4. Submit your pull request for review

View file

@ -4,6 +4,35 @@
This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers. This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers.
### Setting your log level
To specify the desired logging level, set the `LLAMA_STACK_LOGGING` environment variable with the following format:
`LLAMA_STACK_LOGGING=server=debug;core=info`
Where each category in the following list:
- all
- core
- server
- router
- inference
- agents
- safety
- eval
- tools
- client
Can be set to any of the following log levels:
- debug
- info
- warning
- error
- critical
The default global log level is `info`. `all` sets the log level for all components.
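For example, a hypothetical invocation that enables debug logging for the server component while keeping the core at info (the run config path is illustrative):

```
LLAMA_STACK_LOGGING=server=debug;core=info llama stack run <path/to/run.yaml>
```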
### Llama Stack Build ### Llama Stack Build
In order to build your own distribution, we recommend you clone the `llama-stack` repository. In order to build your own distribution, we recommend you clone the `llama-stack` repository.
@ -22,25 +51,25 @@ The main points to consider are:
``` ```
llama stack build -h llama stack build -h
usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates] [--image-type {conda,container,venv}] [--image-name IMAGE_NAME] [--print-deps-only] [--run]
usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates]
[--image-type {conda,container,venv}] [--image-name IMAGE_NAME] [--print-deps-only]
Build a Llama stack container Build a Llama stack container
options: options:
-h, --help show this help message and exit -h, --help show this help message and exit
--config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack/distribution/**/build.yaml. --config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack/distributions/**/build.yaml. If this argument is not provided, you will
If this argument is not provided, you will be prompted to enter information interactively be prompted to enter information interactively (default: None)
--template TEMPLATE Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates --template TEMPLATE Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates (default: None)
--list-templates Show the available templates for building a Llama Stack distribution --list-templates Show the available templates for building a Llama Stack distribution (default: False)
--image-type {conda,container,venv} --image-type {conda,container,venv}
Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config. Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config. (default:
conda)
--image-name IMAGE_NAME --image-name IMAGE_NAME
[for image-type=conda] Name of the conda environment to use for the build. If [for image-type=conda|venv] Name of the conda or virtual environment to use for the build. If not specified, currently active Conda environment will be used if
not specified, currently active Conda environment will be used. If no Conda found. (default: None)
environment is active, you must specify a name. --print-deps-only Print the dependencies for the stack only, without building the stack (default: False)
--print-deps-only Print the dependencies for the stack only, without building the stack --run Run the stack after building using the same image type, name, and other applicable arguments (default: False)
``` ```
After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command. After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
@ -106,7 +135,7 @@ It would be best to start with a template and understand the structure of the co
llama stack build llama stack build
> Enter a name for your Llama Stack (e.g. my-local-stack): my-stack > Enter a name for your Llama Stack (e.g. my-local-stack): my-stack
> Enter the image type you want your Llama Stack to be built as (container or conda): conda > Enter the image type you want your Llama Stack to be built as (container or conda or venv): conda
Llama Stack is composed of several APIs working together. Let's select Llama Stack is composed of several APIs working together. Let's select
the provider types (implementations) you want to use for these APIs. the provider types (implementations) you want to use for these APIs.
@ -183,28 +212,28 @@ Now, let's start the Llama Stack Distribution Server. You will need the YAML con
``` ```
llama stack run -h llama stack run -h
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE] [--tls-certfile TLS_CERTFILE]
[--tls-certfile TLS_CERTFILE] [--image-type {conda,container,venv}] [--image-type {conda,container,venv}]
config config
start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution. Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
positional arguments: positional arguments:
config Path to config file to use for the run config Path to config file to use for the run
options: options:
-h, --help show this help message and exit -h, --help show this help message and exit
--port PORT Port to run the server on. Defaults to 8321 --port PORT Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT. (default: 8321)
--image-name IMAGE_NAME --image-name IMAGE_NAME
Name of the image to run. Defaults to the current conda environment Name of the image to run. Defaults to the current conda environment (default: None)
--disable-ipv6 Disable IPv6 support --disable-ipv6 Disable IPv6 support (default: False)
--env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. --env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times. (default: [])
--tls-keyfile TLS_KEYFILE --tls-keyfile TLS_KEYFILE
Path to TLS key file for HTTPS Path to TLS key file for HTTPS (default: None)
--tls-certfile TLS_CERTFILE --tls-certfile TLS_CERTFILE
Path to TLS certificate file for HTTPS Path to TLS certificate file for HTTPS (default: None)
--image-type {conda,container,venv} --image-type {conda,container,venv}
Image Type used during the build. This can be either conda or container or venv. Image Type used during the build. This can be either conda or container or venv. (default: conda)
``` ```

View file

@ -17,26 +17,4 @@ $ llama-stack-client configure --endpoint https://llamastack-preview.fireworks.a
$ llama-stack-client models list $ llama-stack-client models list
``` ```
You will see outputs:
```
$ llama-stack-client models list
+------------------------------+------------------------------+---------------+------------+
| identifier | llama_model | provider_id | metadata |
+==============================+==============================+===============+============+
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | fireworks0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | fireworks0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | fireworks0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-1B-Instruct | Llama3.2-1B-Instruct | fireworks0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | fireworks0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | fireworks0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | fireworks0 | {} |
+------------------------------+------------------------------+---------------+------------+
```
Check out the [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python/blob/main/docs/cli_reference.md) repo for more details on how to use the `llama-stack-client` CLI. Check out [llama-stack-app](https://github.com/meta-llama/llama-stack-apps/tree/main) for example applications built on top of Llama Stack. Check out the [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python/blob/main/docs/cli_reference.md) repo for more details on how to use the `llama-stack-client` CLI. Check out [llama-stack-app](https://github.com/meta-llama/llama-stack-apps/tree/main) for example applications built on top of Llama Stack.

View file

@ -27,16 +27,19 @@ The following environment variables can be configured:
The following models are available by default: The following models are available by default:
- `meta-llama/Llama-3-8B-Instruct (meta/llama3-8b-instruct)` - `meta/llama3-8b-instruct (aliases: meta-llama/Llama-3-8B-Instruct)`
- `meta-llama/Llama-3-70B-Instruct (meta/llama3-70b-instruct)` - `meta/llama3-70b-instruct (aliases: meta-llama/Llama-3-70B-Instruct)`
- `meta-llama/Llama-3.1-8B-Instruct (meta/llama-3.1-8b-instruct)` - `meta/llama-3.1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `meta-llama/Llama-3.1-70B-Instruct (meta/llama-3.1-70b-instruct)` - `meta/llama-3.1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
- `meta-llama/Llama-3.1-405B-Instruct-FP8 (meta/llama-3.1-405b-instruct)` - `meta/llama-3.1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
- `meta-llama/Llama-3.2-1B-Instruct (meta/llama-3.2-1b-instruct)` - `meta/llama-3.2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
- `meta-llama/Llama-3.2-3B-Instruct (meta/llama-3.2-3b-instruct)` - `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `meta-llama/Llama-3.2-11B-Vision-Instruct (meta/llama-3.2-11b-vision-instruct)` - `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `meta-llama/Llama-3.2-90B-Vision-Instruct (meta/llama-3.2-90b-vision-instruct)` - `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
- `baai/bge-m3 (baai/bge-m3)` - `nvidia/llama-3.2-nv-embedqa-1b-v2 `
- `nvidia/nv-embedqa-e5-v5 `
- `nvidia/nv-embedqa-mistral-7b-v2 `
- `snowflake/arctic-embed-l `
### Prerequisite: API Keys ### Prerequisite: API Keys

View file

@ -34,9 +34,9 @@ The following environment variables can be configured:
The following models are available by default: The following models are available by default:
- `meta-llama/Llama-3.1-8B-Instruct (meta.llama3-1-8b-instruct-v1:0)` - `meta.llama3-1-8b-instruct-v1:0 (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `meta-llama/Llama-3.1-70B-Instruct (meta.llama3-1-70b-instruct-v1:0)` - `meta.llama3-1-70b-instruct-v1:0 (aliases: meta-llama/Llama-3.1-70B-Instruct)`
- `meta-llama/Llama-3.1-405B-Instruct-FP8 (meta.llama3-1-405b-instruct-v1:0)` - `meta.llama3-1-405b-instruct-v1:0 (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
### Prerequisite: API Keys ### Prerequisite: API Keys

View file

@ -27,8 +27,8 @@ The following environment variables can be configured:
The following models are available by default: The following models are available by default:
- `meta-llama/Llama-3.1-8B-Instruct (llama3.1-8b)` - `llama3.1-8b (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `meta-llama/Llama-3.3-70B-Instruct (llama-3.3-70b)` - `llama-3.3-70b (aliases: meta-llama/Llama-3.3-70B-Instruct)`
### Prerequisite: API Keys ### Prerequisite: API Keys

View file

@ -22,7 +22,7 @@ The `llamastack/distribution-fireworks` distribution consists of the following p
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
@ -37,17 +37,17 @@ The following environment variables can be configured:
The following models are available by default: The following models are available by default:
- `meta-llama/Llama-3.1-8B-Instruct (accounts/fireworks/models/llama-v3p1-8b-instruct)` - `accounts/fireworks/models/llama-v3p1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `meta-llama/Llama-3.1-70B-Instruct (accounts/fireworks/models/llama-v3p1-70b-instruct)` - `accounts/fireworks/models/llama-v3p1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
- `meta-llama/Llama-3.1-405B-Instruct-FP8 (accounts/fireworks/models/llama-v3p1-405b-instruct)` - `accounts/fireworks/models/llama-v3p1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
- `meta-llama/Llama-3.2-1B-Instruct (accounts/fireworks/models/llama-v3p2-1b-instruct)` - `accounts/fireworks/models/llama-v3p2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
- `meta-llama/Llama-3.2-3B-Instruct (accounts/fireworks/models/llama-v3p2-3b-instruct)` - `accounts/fireworks/models/llama-v3p2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `meta-llama/Llama-3.2-11B-Vision-Instruct (accounts/fireworks/models/llama-v3p2-11b-vision-instruct)` - `accounts/fireworks/models/llama-v3p2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `meta-llama/Llama-3.2-90B-Vision-Instruct (accounts/fireworks/models/llama-v3p2-90b-vision-instruct)` - `accounts/fireworks/models/llama-v3p2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
- `meta-llama/Llama-3.3-70B-Instruct (accounts/fireworks/models/llama-v3p3-70b-instruct)` - `accounts/fireworks/models/llama-v3p3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
- `meta-llama/Llama-Guard-3-8B (accounts/fireworks/models/llama-guard-3-8b)` - `accounts/fireworks/models/llama-guard-3-8b (aliases: meta-llama/Llama-Guard-3-8B)`
- `meta-llama/Llama-Guard-3-11B-Vision (accounts/fireworks/models/llama-guard-3-11b-vision)` - `accounts/fireworks/models/llama-guard-3-11b-vision (aliases: meta-llama/Llama-Guard-3-11B-Vision)`
- `nomic-ai/nomic-embed-text-v1.5 (nomic-ai/nomic-embed-text-v1.5)` - `nomic-ai/nomic-embed-text-v1.5 `
### Prerequisite: API Keys ### Prerequisite: API Keys

View file

@ -37,11 +37,11 @@ The following environment variables can be configured:
The following models are available by default: The following models are available by default:
- `meta-llama/Llama-3.1-8B-Instruct (llama3-8b-8192)` - `groq/llama3-8b-8192 (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `meta-llama/Llama-3.1-8B-Instruct (llama-3.1-8b-instant)` - `groq/llama-3.1-8b-instant `
- `meta-llama/Llama-3-70B-Instruct (llama3-70b-8192)` - `groq/llama3-70b-8192 (aliases: meta-llama/Llama-3-70B-Instruct)`
- `meta-llama/Llama-3.3-70B-Instruct (llama-3.3-70b-versatile)` - `groq/llama-3.3-70b-versatile (aliases: meta-llama/Llama-3.3-70B-Instruct)`
- `meta-llama/Llama-3.2-3B-Instruct (llama-3.2-3b-preview)` - `groq/llama-3.2-3b-preview (aliases: meta-llama/Llama-3.2-3B-Instruct)`
### Prerequisite: API Keys ### Prerequisite: API Keys

View file

@ -41,12 +41,31 @@ The following environment variables can be configured:
## Prerequisite: Downloading Models ## Prerequisite: Downloading Models
Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
``` ```
$ ls ~/.llama/checkpoints $ llama model list --downloaded
Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M ┃ Model ┃ Size ┃ Modified Time ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │
└─────────────────────────────────────────┴──────────┴─────────────────────┘
``` ```
## Running the Distribution ## Running the Distribution

View file

@ -41,12 +41,31 @@ The following environment variables can be configured:
## Prerequisite: Downloading Models ## Prerequisite: Downloading Models
Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. Please use `llama model list --downloaded` to check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/download_models.html) here to download the models. Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
``` ```
$ ls ~/.llama/checkpoints $ llama model list --downloaded
Llama3.1-8B Llama3.2-11B-Vision-Instruct Llama3.2-1B-Instruct Llama3.2-90B-Vision-Instruct Llama-Guard-3-8B ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
Llama3.1-8B-Instruct Llama3.2-1B Llama3.2-3B-Instruct Llama-Guard-3-1B Prompt-Guard-86M ┃ Model ┃ Size ┃ Modified Time ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │
└─────────────────────────────────────────┴──────────┴─────────────────────┘
``` ```
## Running the Distribution ## Running the Distribution

View file

@ -22,7 +22,7 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
| vector_io | `inline::sqlite-vec`, `remote::chromadb`, `remote::pgvector` | | vector_io | `inline::sqlite-vec`, `remote::chromadb`, `remote::pgvector` |
@ -141,17 +141,21 @@ ollama run <model_name>
To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama. To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama.
``` ```
$ ollama ps $ ollama ps
NAME ID SIZE PROCESSOR UNTIL
NAME ID SIZE PROCESSOR UNTIL llama3.2:3b-instruct-fp16 195a8c01d91e 8.6 GB 100% GPU 9 minutes from now
llama3.1:8b-instruct-fp16 4aacac419454 17 GB 100% GPU 4 minutes from now
``` ```
To verify that the model served by ollama is correctly connected to Llama Stack server To verify that the model served by ollama is correctly connected to Llama Stack server
```bash ```bash
$ llama-stack-client models list $ llama-stack-client models list
+----------------------+----------------------+---------------+-----------------------------------------------+
| identifier | llama_model | provider_id | metadata | Available Models
+======================+======================+===============+===============================================+
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | ollama0 | {'ollama_model': 'llama3.1:8b-instruct-fp16'} | ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━┓
+----------------------+----------------------+---------------+-----------------------------------------------+ ┃ model_type ┃ identifier ┃ provider_resource_id ┃ metadata ┃ provider_id ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━┩
│ llm │ meta-llama/Llama-3.2-3B-Instruct │ llama3.2:3b-instruct-fp16 │ │ ollama │
└──────────────┴──────────────────────────────────────┴──────────────────────────────┴───────────┴─────────────┘
Total models: 1
``` ```

View file

@ -21,7 +21,7 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |

View file

@ -34,15 +34,15 @@ The following environment variables can be configured:
The following models are available by default: The following models are available by default:
- `meta-llama/Llama-3.1-8B-Instruct (Meta-Llama-3.1-8B-Instruct)` - `Meta-Llama-3.1-8B-Instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `meta-llama/Llama-3.1-70B-Instruct (Meta-Llama-3.1-70B-Instruct)` - `Meta-Llama-3.1-70B-Instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
- `meta-llama/Llama-3.1-405B-Instruct-FP8 (Meta-Llama-3.1-405B-Instruct)` - `Meta-Llama-3.1-405B-Instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
- `meta-llama/Llama-3.2-1B-Instruct (Meta-Llama-3.2-1B-Instruct)` - `Meta-Llama-3.2-1B-Instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
- `meta-llama/Llama-3.2-3B-Instruct (Meta-Llama-3.2-3B-Instruct)` - `Meta-Llama-3.2-3B-Instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `meta-llama/Llama-3.3-70B-Instruct (Meta-Llama-3.3-70B-Instruct)` - `Meta-Llama-3.3-70B-Instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
- `meta-llama/Llama-3.2-11B-Vision-Instruct (Llama-3.2-11B-Vision-Instruct)` - `Llama-3.2-11B-Vision-Instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `meta-llama/Llama-3.2-90B-Vision-Instruct (Llama-3.2-90B-Vision-Instruct)` - `Llama-3.2-90B-Vision-Instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
- `meta-llama/Llama-Guard-3-8B (Meta-Llama-Guard-3-8B)` - `Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)`
### Prerequisite: API Keys ### Prerequisite: API Keys

View file

@ -35,7 +35,7 @@ The following environment variables can be configured:
- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) - `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
- `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`) - `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `TGI_URL`: URL of the TGI server with the main inference model (default: `http://127.0.0.1:8080}/v1`) - `TGI_URL`: URL of the TGI server with the main inference model (default: `http://127.0.0.1:8080/v1`)
- `TGI_SAFETY_URL`: URL of the TGI server with the safety model (default: `http://127.0.0.1:8081/v1`) - `TGI_SAFETY_URL`: URL of the TGI server with the safety model (default: `http://127.0.0.1:8081/v1`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`) - `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)

View file

@ -22,7 +22,7 @@ The `llamastack/distribution-together` distribution consists of the following pr
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
@ -37,17 +37,17 @@ The following environment variables can be configured:
The following models are available by default: The following models are available by default:
- `meta-llama/Llama-3.1-8B-Instruct` - `meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `meta-llama/Llama-3.1-70B-Instruct` - `meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo (aliases: meta-llama/Llama-3.1-70B-Instruct)`
- `meta-llama/Llama-3.1-405B-Instruct-FP8` - `meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
- `meta-llama/Llama-3.2-3B-Instruct` - `meta-llama/Llama-3.2-3B-Instruct-Turbo (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `meta-llama/Llama-3.2-11B-Vision-Instruct` - `meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `meta-llama/Llama-3.2-90B-Vision-Instruct` - `meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
- `meta-llama/Llama-3.3-70B-Instruct` - `meta-llama/Llama-3.3-70B-Instruct-Turbo (aliases: meta-llama/Llama-3.3-70B-Instruct)`
- `meta-llama/Llama-Guard-3-8B` - `meta-llama/Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)`
- `meta-llama/Llama-Guard-3-11B-Vision` - `meta-llama/Llama-Guard-3-11B-Vision-Turbo (aliases: meta-llama/Llama-Guard-3-11B-Vision)`
- `togethercomputer/m2-bert-80M-8k-retrieval` - `togethercomputer/m2-bert-80M-8k-retrieval `
- `togethercomputer/m2-bert-80M-32k-retrieval` - `togethercomputer/m2-bert-80M-32k-retrieval `
### Prerequisite: API Keys ### Prerequisite: API Keys

View file

@ -38,7 +38,7 @@ The API is **exactly identical** for both clients.
:::{dropdown} Starting up the Llama Stack server :::{dropdown} Starting up the Llama Stack server
The Llama Stack server can be configured flexibly so you can mix-and-match various providers for its individual API components -- beyond Inference, these include Vector IO, Agents, Telemetry, Evals, Post Training, etc. The Llama Stack server can be configured flexibly so you can mix-and-match various providers for its individual API components -- beyond Inference, these include Vector IO, Agents, Telemetry, Evals, Post Training, etc.
To get started quickly, we provide various container images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the container image. To get started quickly, we provide various container images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the container image. If you'd like to build your own image or customize the configurations, please check out [this guide](../references/index.md).
Lets setup some environment variables that we will use in the rest of the guide. Lets setup some environment variables that we will use in the rest of the guide.
```bash ```bash
@ -102,12 +102,18 @@ Let's use the `llama-stack-client` CLI to check the connectivity to the server.
$ llama-stack-client configure --endpoint http://localhost:$LLAMA_STACK_PORT $ llama-stack-client configure --endpoint http://localhost:$LLAMA_STACK_PORT
> Enter the API key (leave empty if no key is needed): > Enter the API key (leave empty if no key is needed):
Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321 Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
$ llama-stack-client models list $ llama-stack-client models list
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ provider_resource_id ┃ metadata ┃ Available Models
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
│ meta-llama/Llama-3.2-3B-Instruct │ ollama │ llama3.2:3b-instruct-fp16 │ │ ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━┓
└──────────────────────────────────┴─────────────┴───────────────────────────┴──────────┘ ┃ model_type ┃ identifier ┃ provider_resource_id ┃ metadata ┃ provider_id ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━┩
│ llm │ meta-llama/Llama-3.2-3B-Instruct │ llama3.2:3b-instruct-fp16 │ │ ollama │
└──────────────┴──────────────────────────────────────┴──────────────────────────────┴───────────┴─────────────┘
Total models: 1
``` ```
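The same check can also be done from Python with the client library; a minimal sketch, assuming the server started above is reachable at `localhost:8321`:
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Print the registered models, mirroring the CLI table above.
for model in client.models.list():
    print(model.model_type, model.identifier, model.provider_id)
```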
You can test basic Llama inference completion using the CLI too. You can test basic Llama inference completion using the CLI too.
@ -178,7 +184,6 @@ from termcolor import cprint
from llama_stack_client.lib.agents.agent import Agent from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client.types.agent_create_params import AgentConfig
from llama_stack_client.types import Document from llama_stack_client.types import Document
@ -235,27 +240,26 @@ client.tool_runtime.rag_tool.insert(
chunk_size_in_tokens=512, chunk_size_in_tokens=512,
) )
agent_config = AgentConfig( rag_agent = Agent(
client,
model=os.environ["INFERENCE_MODEL"], model=os.environ["INFERENCE_MODEL"],
# Define instructions for the agent ( aka system prompt) # Define instructions for the agent ( aka system prompt)
instructions="You are a helpful assistant", instructions="You are a helpful assistant",
enable_session_persistence=False, enable_session_persistence=False,
# Define tools available to the agent # Define tools available to the agent
toolgroups=[ tools=[
{ {
"name": "builtin::rag", "name": "builtin::rag/knowledge_search",
"args": { "args": {
"vector_db_ids": [vector_db_id], "vector_db_ids": [vector_db_id],
}, },
} }
], ],
) )
rag_agent = Agent(client, agent_config)
session_id = rag_agent.create_session("test-session") session_id = rag_agent.create_session("test-session")
user_prompts = [ user_prompts = [
"What are the top 5 topics that were explained? Only list succinct bullet points.", "How to optimize memory usage in torchtune? use the knowledge_search tool to get information.",
] ]
# Run the agent loop by calling the `create_turn` method # Run the agent loop by calling the `create_turn` method

View file

@ -1,8 +1,7 @@
```{admonition} News ```{admonition} News
:class: tip :class: tip
Llama Stack 0.1.4 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.4) for more details. Llama Stack {{ llama_stack_version }} is now available! See the {{ llama_stack_version_link }} for more details.
``` ```
# Llama Stack # Llama Stack
@ -69,6 +68,7 @@ A number of "adapters" are available for some popular Inference and Vector Store
| FAISS | Single Node | | FAISS | Single Node |
| SQLite-Vec| Single Node | | SQLite-Vec| Single Node |
| Chroma | Hosted and Single Node | | Chroma | Hosted and Single Node |
| Milvus | Hosted and Single Node |
| Postgres (PGVector) | Hosted and Single Node | | Postgres (PGVector) | Hosted and Single Node |
| Weaviate | Hosted | | Weaviate | Hosted |

View file

@ -2,7 +2,7 @@
The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include: The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.), - LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, etc.),
- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, FAISS, PGVector, etc.), - Vector databases (e.g., ChromaDB, Weaviate, Qdrant, Milvus, FAISS, PGVector, etc.),
- Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.) - Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.)
Providers come in two flavors: Providers come in two flavors:
@ -36,7 +36,7 @@ Evaluates the outputs of the system.
Collects telemetry data from the system. Collects telemetry data from the system.
## Tool Runtime ## Tool Runtime
Is associated with the ToolGroup resources. Is associated with the ToolGroup resources.
## Vector IO ## Vector IO
@ -55,5 +55,6 @@ vector_io/sqlite-vec
vector_io/chromadb vector_io/chromadb
vector_io/pgvector vector_io/pgvector
vector_io/qdrant vector_io/qdrant
vector_io/milvus
vector_io/weaviate vector_io/weaviate
``` ```

View file

@ -1,10 +1,10 @@
--- ---
orphan: true orphan: true
--- ---
# Chroma # Chroma
[Chroma](https://www.trychroma.com/) is an inline and remote vector [Chroma](https://www.trychroma.com/) is an inline and remote vector
database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database. database provider for Llama Stack. It allows you to store and query vectors directly within a Chroma database.
That means you're not limited to storing vectors in memory or in a separate service. That means you're not limited to storing vectors in memory or in a separate service.
## Features ## Features

View file

@ -3,7 +3,7 @@ orphan: true
--- ---
# Faiss # Faiss
[Faiss](https://github.com/facebookresearch/faiss) is an inline vector database provider for Llama Stack. It [Faiss](https://github.com/facebookresearch/faiss) is an inline vector database provider for Llama Stack. It
allows you to store and query vectors directly in memory. allows you to store and query vectors directly in memory.
That means you'll get fast and efficient vector retrieval. That means you'll get fast and efficient vector retrieval.
@ -29,5 +29,5 @@ You can install Faiss using pip:
pip install faiss-cpu pip install faiss-cpu
``` ```
## Documentation ## Documentation
See [Faiss' documentation](https://faiss.ai/) or the [Faiss Wiki](https://github.com/facebookresearch/faiss/wiki) for See [Faiss' documentation](https://faiss.ai/) or the [Faiss Wiki](https://github.com/facebookresearch/faiss/wiki) for
more details about Faiss in general. more details about Faiss in general.

View file

@ -0,0 +1,31 @@
---
orphan: true
---
# Milvus
[Milvus](https://milvus.io/) is an inline and remote vector database provider for Llama Stack. It
allows you to store and query vectors directly within a Milvus database.
That means you're not limited to storing vectors in memory or in a separate service.
## Features
- Easy to use
- Fully integrated with Llama Stack
## Usage
To use Milvus in your Llama Stack project, follow these steps:
1. Install the necessary dependencies.
2. Configure your Llama Stack project to use Milvus.
3. Start storing and querying vectors.
## Installation
You can install Milvus using pymilvus:
```bash
pip install pymilvus
```
## Documentation
See the [Milvus documentation](https://milvus.io/docs/install-overview.md) for more details about Milvus in general.

View file

@ -3,7 +3,7 @@ orphan: true
--- ---
# Postgres PGVector # Postgres PGVector
[PGVector](https://github.com/pgvector/pgvector) is a remote vector database provider for Llama Stack. It [PGVector](https://github.com/pgvector/pgvector) is a remote vector database provider for Llama Stack. It
allows you to store and query vectors directly in memory. allows you to store and query vectors directly in memory.
That means you'll get fast and efficient vector retrieval. That means you'll get fast and efficient vector retrieval.

View file

@ -3,7 +3,7 @@ orphan: true
--- ---
# Qdrant # Qdrant
[Qdrant](https://qdrant.tech/documentation/) is a remote vector database provider for Llama Stack. It [Qdrant](https://qdrant.tech/documentation/) is a remote vector database provider for Llama Stack. It
allows you to store and query vectors directly in memory. allows you to store and query vectors directly in memory.
That means you'll get fast and efficient vector retrieval. That means you'll get fast and efficient vector retrieval.

View file

@ -3,8 +3,8 @@ orphan: true
--- ---
# SQLite-Vec # SQLite-Vec
[SQLite-Vec](https://github.com/asg017/sqlite-vec) is an inline vector database provider for Llama Stack. It [SQLite-Vec](https://github.com/asg017/sqlite-vec) is an inline vector database provider for Llama Stack. It
allows you to store and query vectors directly within an SQLite database. allows you to store and query vectors directly within an SQLite database.
That means you're not limited to storing vectors in memory or in a separate service. That means you're not limited to storing vectors in memory or in a separate service.
## Features ## Features

View file

@ -1,10 +1,10 @@
--- ---
orphan: true orphan: true
--- ---
# Weaviate # Weaviate
[Weaviate](https://weaviate.io/) is a vector database provider for Llama Stack. [Weaviate](https://weaviate.io/) is a vector database provider for Llama Stack.
It allows you to store and query vectors directly within a Weaviate database. It allows you to store and query vectors directly within a Weaviate database.
That means you're not limited to storing vectors in memory or in a separate service. That means you're not limited to storing vectors in memory or in a separate service.
## Features ## Features
@ -27,7 +27,7 @@ To use Weaviate in your Llama Stack project, follow these steps:
## Installation ## Installation
To install Weaviate see the [Weaviate quickstart documentation](https://weaviate.io/developers/weaviate/quickstart). To install Weaviate see the [Weaviate quickstart documentation](https://weaviate.io/developers/weaviate/quickstart).
## Documentation ## Documentation
See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more details about Weaviate in general. See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more details about Weaviate in general.

View file

@ -24,19 +24,9 @@ The Evaluation APIs are associated with a set of Resources as shown in the follo
- Associated with `Benchmark` resource. - Associated with `Benchmark` resource.
Use the following decision tree to decide how to use LlamaStack Evaluation flow.
![Eval Flow](./resources/eval-flow.png)
```{admonition} Note on Benchmark v.s. Application Evaluation
:class: tip
- **Benchmark Evaluation** is a well-defined eval-task consisting of `dataset` and `scoring_function`. The generation (inference or agent) will be done as part of evaluation.
- **Application Evaluation** assumes users already have app inputs & generated outputs. Evaluation will purely focus on scoring the generated outputs via scoring functions (e.g. LLM-as-judge).
```
## Evaluation Examples Walkthrough ## Evaluation Examples Walkthrough
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb)
It is best to open this notebook in Colab to follow along with the examples. It is best to open this notebook in Colab to follow along with the examples.
@ -63,20 +53,29 @@ eval_rows = ds.to_pandas().to_dict(orient="records")
- Run evaluate on the dataset - Run evaluate on the dataset
```python ```python
from rich.pretty import pprint
from tqdm import tqdm
SYSTEM_PROMPT_TEMPLATE = """ SYSTEM_PROMPT_TEMPLATE = """
You are an expert in Agriculture whose job is to answer questions from the user using images. You are an expert in {subject} whose job is to answer questions from the user using images.
First, reason about the correct answer. First, reason about the correct answer.
Then write the answer in the following format where X is exactly one of A,B,C,D: Then write the answer in the following format where X is exactly one of A,B,C,D:
Answer: X Answer: X
Make sure X is one of A,B,C,D. Make sure X is one of A,B,C,D.
If you are uncertain of the correct answer, guess the most likely one. If you are uncertain of the correct answer, guess the most likely one.
""" """
system_message = { system_message = {
"role": "system", "role": "system",
"content": SYSTEM_PROMPT_TEMPLATE, "content": SYSTEM_PROMPT_TEMPLATE.format(subject=subset),
} }
# register the evaluation benchmark task with the dataset and scoring function
client.benchmarks.register( client.benchmarks.register(
benchmark_id="meta-reference::mmmu", benchmark_id="meta-reference::mmmu",
dataset_id=f"mmmu-{subset}-{split}", dataset_id=f"mmmu-{subset}-{split}",
@ -87,14 +86,15 @@ response = client.eval.evaluate_rows(
benchmark_id="meta-reference::mmmu", benchmark_id="meta-reference::mmmu",
input_rows=eval_rows, input_rows=eval_rows,
scoring_functions=["basic::regex_parser_multiple_choice_answer"], scoring_functions=["basic::regex_parser_multiple_choice_answer"],
task_config={ benchmark_config={
"type": "benchmark",
"eval_candidate": { "eval_candidate": {
"type": "model", "type": "model",
"model": "meta-llama/Llama-3.2-90B-Vision-Instruct", "model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
"sampling_params": { "sampling_params": {
"strategy": { "strategy": {
"type": "greedy", "type": "top_p",
"temperature": 1.0,
"top_p": 0.95,
}, },
"max_tokens": 4096, "max_tokens": 4096,
"repeat_penalty": 1.0, "repeat_penalty": 1.0,
@ -103,6 +103,7 @@ response = client.eval.evaluate_rows(
}, },
}, },
) )
pprint(response)
``` ```
#### 1.2. Running SimpleQA #### 1.2. Running SimpleQA
@ -115,10 +116,9 @@ simpleqa_dataset_id = "huggingface::simpleqa"
_ = client.datasets.register( _ = client.datasets.register(
dataset_id=simpleqa_dataset_id, dataset_id=simpleqa_dataset_id,
provider_id="huggingface", provider_id="huggingface",
url={"uri": "https://huggingface.co/datasets/llamastack/evals"}, url={"uri": "https://huggingface.co/datasets/llamastack/simpleqa"},
metadata={ metadata={
"path": "llamastack/evals", "path": "llamastack/simpleqa",
"name": "evals__simpleqa",
"split": "train", "split": "train",
}, },
dataset_schema={ dataset_schema={
@ -145,8 +145,7 @@ response = client.eval.evaluate_rows(
benchmark_id="meta-reference::simpleqa", benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows, input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"], scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={ benchmark_config={
"type": "benchmark",
"eval_candidate": { "eval_candidate": {
"type": "model", "type": "model",
"model": "meta-llama/Llama-3.2-90B-Vision-Instruct", "model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
@ -160,6 +159,7 @@ response = client.eval.evaluate_rows(
}, },
}, },
) )
pprint(response)
``` ```
@ -170,19 +170,17 @@ response = client.eval.evaluate_rows(
```python ```python
agent_config = { agent_config = {
"model": "meta-llama/Llama-3.1-405B-Instruct", "model": "meta-llama/Llama-3.3-70B-Instruct",
"instructions": "You are a helpful assistant", "instructions": "You are a helpful assistant that have access to tool to search the web. ",
"sampling_params": { "sampling_params": {
"strategy": { "strategy": {
"type": "greedy", "type": "top_p",
}, "temperature": 0.5,
}, "top_p": 0.9,
"tools": [
{
"type": "brave_search",
"engine": "tavily",
"api_key": userdata.get("TAVILY_SEARCH_API_KEY"),
} }
},
"toolgroups": [
"builtin::websearch",
], ],
"tool_choice": "auto", "tool_choice": "auto",
"tool_prompt_format": "json", "tool_prompt_format": "json",
@ -195,25 +193,22 @@ response = client.eval.evaluate_rows(
benchmark_id="meta-reference::simpleqa", benchmark_id="meta-reference::simpleqa",
input_rows=eval_rows.rows, input_rows=eval_rows.rows,
scoring_functions=["llm-as-judge::405b-simpleqa"], scoring_functions=["llm-as-judge::405b-simpleqa"],
task_config={ benchmark_config={
"type": "benchmark",
"eval_candidate": { "eval_candidate": {
"type": "agent", "type": "agent",
"config": agent_config, "config": agent_config,
}, },
}, },
) )
pprint(response)
``` ```
### 3. Agentic Application Dataset Scoring ### 3. Agentic Application Dataset Scoring
- Llama Stack offers a library of scoring functions and the `/scoring` API, allowing you to run evaluations on your pre-annotated AI application datasets. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)
- In this example, we will work with an example RAG dataset and couple of scoring functions for evaluation. Llama Stack offers a library of scoring functions and the `/scoring` API, allowing you to run evaluations on your pre-annotated AI application datasets.
- `llm-as-judge::base`: LLM-As-Judge with custom judge prompt & model.
- `braintrust::factuality`: Factuality scorer from [braintrust](https://github.com/braintrustdata/autoevals).
- `basic::subset_of`: Basic checking if generated answer is a subset of expected answer.
- Please checkout our [Llama Stack Playground](https://llama-stack.readthedocs.io/en/latest/playground/index.html) for an interactive interface to upload datasets and run scorings. In this example, we will work with an example RAG dataset you built previously, label it with an annotation, and use LLM-As-Judge with a custom judge prompt for scoring. Please check out our [Llama Stack Playground](https://llama-stack.readthedocs.io/en/latest/playground/index.html) for an interactive interface to upload datasets and run scorings.
```python ```python
judge_model_id = "meta-llama/Llama-3.1-405B-Instruct-FP8" judge_model_id = "meta-llama/Llama-3.1-405B-Instruct-FP8"
@ -280,18 +275,25 @@ response = client.scoring.score(
The following examples give the quick steps to start running evaluations using the llama-stack-client CLI. The following examples give the quick steps to start running evaluations using the llama-stack-client CLI.
#### Benchmark Evaluation CLI #### Benchmark Evaluation CLI
Usage: There are 2 inputs necessary for running a benchmark eval There are 3 necessary inputs for running a benchmark eval
- `eval-task-id`: the identifier associated with the eval task. Each `Benchmark` is parametrized by - `list of benchmark_ids`: The list of benchmark ids to run evaluation on
- `dataset_id`: the identifier associated with the dataset. - `model-id`: The model id to evaluate on
- `List[scoring_function_id]`: list of scoring function identifiers. - `output_dir`: Path to store the evaluation results
- `eval-task-config`: specifies the configuration of the model / agent to evaluate on. ```
llama-stack-client eval run-benchmark <benchmark_id_1> <benchmark_id_2> ... \
--model_id <model id to evaluate on> \
--output_dir <directory to store the evaluation results>
```
You can run
```
llama-stack-client eval run-benchmark help
```
to see the description of all the flags for running a benchmark eval
``` In the output log, you can find the path to the file that has your evaluation results. Open that file and you can see your aggregate
llama-stack-client eval run_benchmark <eval-task-id> \ evaluation results there.
--eval-task-config ~/benchmark_config.json \
--visualize
```
#### Application Evaluation CLI #### Application Evaluation CLI
@ -317,28 +319,9 @@ The `BenchmarkConfig` are user specified config to define:
2. Optionally scoring function params to allow customization of scoring function behaviour. This is useful to parameterize generic scoring functions such as LLMAsJudge with custom `judge_model` / `judge_prompt`. 2. Optionally scoring function params to allow customization of scoring function behaviour. This is useful to parameterize generic scoring functions such as LLMAsJudge with custom `judge_model` / `judge_prompt`.
**Example Benchmark BenchmarkConfig** **Example BenchmarkConfig**
```json ```json
{ {
"type": "benchmark",
"eval_candidate": {
"type": "model",
"model": "Llama3.2-3B-Instruct",
"sampling_params": {
"strategy": {
"type": "greedy",
},
"max_tokens": 0,
"repetition_penalty": 1.0
}
}
}
```
**Example Application BenchmarkConfig**
```json
{
"type": "app",
"eval_candidate": { "eval_candidate": {
"type": "model", "type": "model",
"model": "Llama3.1-405B-Instruct", "model": "Llama3.1-405B-Instruct",
@ -362,3 +345,52 @@ The `BenchmarkConfig` are user specified config to define:
} }
} }
``` ```
## Open-benchmark Contributing Guide
### Create the new dataset for your new benchmark
An eval open-benchmark essentially consists of two parts:
- `raw data`: The raw dataset associated with the benchmark. You typically need to search the original paper that introduces the benchmark and find the canonical dataset (usually hosted on huggingface)
- `prompt template`: How to ask the candidate model to generate the answer (the prompt template plays a critical role in the evaluation results). Typically, you can find the reference prompt template associated with the benchmark in the benchmark author's repo ([example](https://github.com/idavidrein/gpqa/blob/main/prompts/chain_of_thought.txt)) or in other popular open source repos ([example](https://github.com/openai/simple-evals/blob/0a6e8f62e52bc5ae915f752466be3af596caf392/common.py#L14))
To create a new open-benchmark in Llama Stack, you need to combine the prompt template and the raw data into the `chat_completion_input` column of the evaluation dataset.
Llama Stack enforces that the evaluation dataset schema contains at least 3 columns:
- `chat_completion_input`: The actual input to the model to run the generation for eval
- `input_query`: The raw input from the raw dataset without the prompt template
- `expected_answer`: The ground truth for scoring functions to calculate the score from.
You need to write a script ([example conversion script](https://gist.github.com/yanxi0830/118e9c560227d27132a7fd10e2c92840)) to convert the raw benchmark dataset into the Llama Stack eval dataset format, as sketched below, and upload the dataset to huggingface ([example benchmark dataset](https://huggingface.co/datasets/llamastack/mmmu))
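A minimal sketch of that conversion step (the prompt template, the field names in `raw_rows`, and the exact encoding of `chat_completion_input` are assumptions for illustration; verify against the example dataset linked above):
```python
import json

# Placeholder prompt template and raw rows; substitute the benchmark's own.
PROMPT_TEMPLATE = "Answer with a single letter A, B, C or D.\n\n{question}\n{options}\nAnswer:"

raw_rows = [
    {"question": "What is 2 + 2?", "options": "A. 3  B. 4  C. 5  D. 6", "answer": "B"},
]

eval_rows = [
    {
        "input_query": row["question"],
        # Encoded here as a JSON string of chat messages; check the example dataset for the exact format.
        "chat_completion_input": json.dumps(
            [{"role": "user", "content": PROMPT_TEMPLATE.format(**row)}]
        ),
        "expected_answer": row["answer"],
    }
    for row in raw_rows
]
print(eval_rows[0])
```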
### Find scoring function for your new benchmark
The purpose of a scoring function is to calculate the score for each example based on the candidate model's generation result and the `expected_answer`. It also aggregates the scores across all the examples and generates the final evaluation results.
First, check whether the existing [llama stack scoring functions](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/scoring) can fulfill your need. If not, write a new scoring function based on what the benchmark author or other open source repos describe.
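As a plain-Python illustration (not the actual Llama Stack provider interface), a scoring function boils down to a per-row score plus an aggregation step:
```python
def score_row(generated_answer: str, expected_answer: str) -> float:
    # Toy criterion: exact match on the letter following "Answer:".
    predicted = generated_answer.split("Answer:")[-1].strip()
    return 1.0 if predicted == expected_answer.strip() else 0.0


def aggregate(scores: list[float]) -> dict:
    return {"accuracy": sum(scores) / len(scores) if scores else 0.0, "num_examples": len(scores)}


rows = [
    {"generated_answer": "Answer: B", "expected_answer": "B"},
    {"generated_answer": "Answer: C", "expected_answer": "A"},
]
print(aggregate([score_row(r["generated_answer"], r["expected_answer"]) for r in rows]))
```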
### Add new benchmark into template
First, add the evaluation dataset associated with your benchmark under the `datasets` resource in the [open-benchmark](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/open-benchmark/run.yaml) template
Second, add the new benchmark you just created under the `benchmarks` resource in the same template. To add the new benchmark, you need to provide:
- `benchmark_id`: identifier of the benchmark
- `dataset_id`: identifier of the dataset associated with your benchmark
- `scoring_functions`: scoring function to calculate the score based on generation results and expected_answer
### Test the new benchmark
Spin up the Llama Stack server with the 'open-benchmark' template
```
llama stack run llama_stack/templates/open-benchmark/run.yaml
```
Run the eval benchmark CLI with your new benchmark id
```
llama-stack-client eval run-benchmark <new_benchmark_id> \
--model_id <model id to evaluate on> \
--output_dir <directory to store the evaluation results>
```

View file

@ -129,3 +129,35 @@ llama download --source huggingface --model-id Prompt-Guard-86M --ignore-pattern
**Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens). **Important:** Set your environment variable `HF_TOKEN` or pass in `--hf-token` to the command to validate your access. You can find your token at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).
> **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored. > **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
## List the downloaded models
To list the downloaded models, use the following command:
```
llama model list --downloaded
```
You should see a table like this:
```
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃ Model ┃ Size ┃ Modified Time ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │
└─────────────────────────────────────────┴──────────┴─────────────────────┘
```

View file

@ -154,6 +154,38 @@ llama download --source huggingface --model-id Prompt-Guard-86M --ignore-pattern
> **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored. > **Tip:** Default for `llama download` is to run with `--ignore-patterns *.safetensors` since we use the `.pth` files in the `original` folder. For Llama Guard and Prompt Guard, however, we need safetensors. Hence, please run with `--ignore-patterns original` so that safetensors are downloaded and `.pth` files are ignored.
## List the downloaded models
To list the downloaded models, use the following command:
```
llama model list --downloaded
```
You should see a table like this:
```
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃ Model ┃ Size ┃ Modified Time ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ Llama3.2-1B-Instruct:int4-qlora-eo8 │ 1.53 GB │ 2025-02-26 11:22:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B │ 2.31 GB │ 2025-02-18 21:48:52 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Prompt-Guard-86M │ 0.02 GB │ 2025-02-26 11:29:28 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B-Instruct:int4-spinquant-eo8 │ 3.69 GB │ 2025-02-26 11:37:41 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-3B │ 5.99 GB │ 2025-02-18 21:51:26 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.1-8B │ 14.97 GB │ 2025-02-16 10:36:37 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama3.2-1B-Instruct:int4-spinquant-eo8 │ 1.51 GB │ 2025-02-26 11:35:02 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B │ 2.80 GB │ 2025-02-26 11:20:46 │
├─────────────────────────────────────────┼──────────┼─────────────────────┤
│ Llama-Guard-3-1B:int4 │ 0.43 GB │ 2025-02-26 11:33:33 │
└─────────────────────────────────────────┴──────────┴─────────────────────┘
```
## Understand the models ## Understand the models
The `llama model` command helps you explore the models interface. The `llama model` command helps you explore the models interface.

View file

@ -58,11 +58,15 @@ llama-stack-client providers list
llama-stack-client models list llama-stack-client models list
``` ```
``` ```
+----------------------+----------------------+---------------+----------------------------------------------------------+ Available Models
| identifier | llama_model | provider_id | metadata |
+======================+======================+===============+==========================================================+ ┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━┓
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | tgi0 | {'huggingface_repo': 'meta-llama/Llama-3.1-8B-Instruct'} | ┃ model_type ┃ identifier ┃ provider_resource_id ┃ metadata ┃ provider_id ┃
+----------------------+----------------------+---------------+----------------------------------------------------------+ ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━┩
│ llm │ meta-llama/Llama-3.2-3B-Instruct │ llama3.2:3b-instruct-fp16 │ │ ollama │
└──────────────┴──────────────────────────────────────┴──────────────────────────────┴───────────┴─────────────┘
Total models: 1
``` ```
### `llama-stack-client models get` ### `llama-stack-client models get`

View file

@ -294,8 +294,9 @@
" # Initialize custom tool (ensure `WebSearchTool` is defined earlier in the notebook)\n", " # Initialize custom tool (ensure `WebSearchTool` is defined earlier in the notebook)\n",
" webSearchTool = WebSearchTool(api_key=BRAVE_SEARCH_API_KEY)\n", " webSearchTool = WebSearchTool(api_key=BRAVE_SEARCH_API_KEY)\n",
"\n", "\n",
" # Define the agent configuration, including the model and tool setup\n", " # Create an agent instance with the client and configuration\n",
" agent_config = AgentConfig(\n", " agent = Agent(\n",
" client, \n",
" model=MODEL_NAME,\n", " model=MODEL_NAME,\n",
" instructions=\"\"\"You are a helpful assistant that responds to user queries with relevant information and cites sources when available.\"\"\",\n", " instructions=\"\"\"You are a helpful assistant that responds to user queries with relevant information and cites sources when available.\"\"\",\n",
" sampling_params={\n", " sampling_params={\n",
@ -303,17 +304,12 @@
" \"type\": \"greedy\",\n", " \"type\": \"greedy\",\n",
" },\n", " },\n",
" },\n", " },\n",
" tools=[webSearchTool.get_tool_definition()],\n", " tools=[webSearchTool],\n",
" tool_choice=\"auto\",\n",
" tool_prompt_format=\"python_list\",\n",
" input_shields=input_shields,\n", " input_shields=input_shields,\n",
" output_shields=output_shields,\n", " output_shields=output_shields,\n",
" enable_session_persistence=False,\n", " enable_session_persistence=False,\n",
" )\n", " )\n",
"\n", "\n",
" # Create an agent instance with the client and configuration\n",
" agent = Agent(client, agent_config, [webSearchTool])\n",
"\n",
" # Create a session for interaction and print the session ID\n", " # Create a session for interaction and print the session ID\n",
" session_id = agent.create_session(\"test-session\")\n", " session_id = agent.create_session(\"test-session\")\n",
" print(f\"Created session_id={session_id} for Agent({agent.agent_id})\")\n", " print(f\"Created session_id={session_id} for Agent({agent.agent_id})\")\n",

View file

@ -110,12 +110,12 @@
"from llama_stack_client import LlamaStackClient\n", "from llama_stack_client import LlamaStackClient\n",
"from llama_stack_client.lib.agents.agent import Agent\n", "from llama_stack_client.lib.agents.agent import Agent\n",
"from llama_stack_client.lib.agents.event_logger import EventLogger\n", "from llama_stack_client.lib.agents.event_logger import EventLogger\n",
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
"\n", "\n",
"\n", "\n",
"async def agent_example():\n", "async def agent_example():\n",
" client = LlamaStackClient(base_url=f\"http://{HOST}:{PORT}\")\n", " client = LlamaStackClient(base_url=f\"http://{HOST}:{PORT}\")\n",
" agent_config = AgentConfig(\n", " agent = Agent(\n",
" client, \n",
" model=MODEL_NAME,\n", " model=MODEL_NAME,\n",
" instructions=\"You are a helpful assistant! If you call builtin tools like brave search, follow the syntax brave_search.call(…)\",\n", " instructions=\"You are a helpful assistant! If you call builtin tools like brave search, follow the syntax brave_search.call(…)\",\n",
" sampling_params={\n", " sampling_params={\n",
@ -130,14 +130,7 @@
" \"api_key\": BRAVE_SEARCH_API_KEY,\n", " \"api_key\": BRAVE_SEARCH_API_KEY,\n",
" }\n", " }\n",
" ],\n", " ],\n",
" tool_choice=\"auto\",\n",
" tool_prompt_format=\"function_tag\",\n",
" input_shields=[],\n",
" output_shields=[],\n",
" enable_session_persistence=False,\n",
" )\n", " )\n",
"\n",
" agent = Agent(client, agent_config)\n",
" session_id = agent.create_session(\"test-session\")\n", " session_id = agent.create_session(\"test-session\")\n",
" print(f\"Created session_id={session_id} for Agent({agent.agent_id})\")\n", " print(f\"Created session_id={session_id} for Agent({agent.agent_id})\")\n",
"\n", "\n",

View file

@ -73,7 +73,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
Open a new terminal and install `llama-stack`: Open a new terminal and install `llama-stack`:
```bash ```bash
conda activate ollama conda activate ollama
pip install llama-stack==0.1.0 pip install -U llama-stack
``` ```
--- ---

View file

@ -103,7 +103,6 @@
"from llama_stack_client.lib.agents.agent import Agent\n", "from llama_stack_client.lib.agents.agent import Agent\n",
"from llama_stack_client.lib.agents.event_logger import EventLogger\n", "from llama_stack_client.lib.agents.event_logger import EventLogger\n",
"from llama_stack_client.types.agent_create_params import (\n", "from llama_stack_client.types.agent_create_params import (\n",
" AgentConfig,\n",
" AgentConfigToolSearchToolDefinition,\n", " AgentConfigToolSearchToolDefinition,\n",
")\n", ")\n",
"\n", "\n",
@ -117,7 +116,8 @@
") -> Agent:\n", ") -> Agent:\n",
" \"\"\"Create an agent with specified tools.\"\"\"\n", " \"\"\"Create an agent with specified tools.\"\"\"\n",
" print(\"Using the following model: \", model)\n", " print(\"Using the following model: \", model)\n",
" agent_config = AgentConfig(\n", " return Agent(\n",
" client, \n",
" model=model,\n", " model=model,\n",
" instructions=instructions,\n", " instructions=instructions,\n",
" sampling_params={\n", " sampling_params={\n",
@ -126,12 +126,7 @@
" },\n", " },\n",
" },\n", " },\n",
" tools=tools,\n", " tools=tools,\n",
" tool_choice=\"auto\",\n", " )\n"
" tool_prompt_format=\"json\",\n",
" enable_session_persistence=True,\n",
" )\n",
"\n",
" return Agent(client, agent_config)\n"
] ]
}, },
{ {
@ -360,9 +355,9 @@
" # Create the agent with the tool\n", " # Create the agent with the tool\n",
" weather_tool = WeatherTool()\n", " weather_tool = WeatherTool()\n",
"\n", "\n",
" agent_config = AgentConfig(\n", " agent = Agent(\n",
" client=client, \n",
" model=LLAMA31_8B_INSTRUCT,\n", " model=LLAMA31_8B_INSTRUCT,\n",
" # model=model_name,\n",
" instructions=\"\"\"\n", " instructions=\"\"\"\n",
" You are a weather assistant that can provide weather information.\n", " You are a weather assistant that can provide weather information.\n",
" Always specify the location clearly in your responses.\n", " Always specify the location clearly in your responses.\n",
@ -373,16 +368,9 @@
" \"type\": \"greedy\",\n", " \"type\": \"greedy\",\n",
" },\n", " },\n",
" },\n", " },\n",
" tools=[weather_tool.get_tool_definition()],\n", " tools=[weather_tool],\n",
" tool_choice=\"auto\",\n",
" tool_prompt_format=\"json\",\n",
" input_shields=[],\n",
" output_shields=[],\n",
" enable_session_persistence=True,\n",
" )\n", " )\n",
"\n", "\n",
" agent = Agent(client=client, agent_config=agent_config, custom_tools=[weather_tool])\n",
"\n",
" return agent\n", " return agent\n",
"\n", "\n",
"\n", "\n",

View file

@ -41,16 +41,36 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho
class Attachment(BaseModel): class Attachment(BaseModel):
"""An attachment to an agent turn.
:param content: The content of the attachment.
:param mime_type: The MIME type of the attachment.
"""
content: InterleavedContent | URL content: InterleavedContent | URL
mime_type: str mime_type: str
class Document(BaseModel): class Document(BaseModel):
"""A document to be used by an agent.
:param content: The content of the document.
:param mime_type: The MIME type of the document.
"""
content: InterleavedContent | URL content: InterleavedContent | URL
mime_type: str mime_type: str
class StepCommon(BaseModel): class StepCommon(BaseModel):
"""A common step in an agent turn.
:param turn_id: The ID of the turn.
:param step_id: The ID of the step.
:param started_at: The time the step started.
:param completed_at: The time the step completed.
"""
turn_id: str turn_id: str
step_id: str step_id: str
started_at: Optional[datetime] = None started_at: Optional[datetime] = None
@ -58,6 +78,14 @@ class StepCommon(BaseModel):
class StepType(Enum): class StepType(Enum):
"""Type of the step in an agent turn.
:cvar inference: The step is an inference step that calls an LLM.
:cvar tool_execution: The step is a tool execution step that executes a tool call.
:cvar shield_call: The step is a shield call step that checks for safety violations.
:cvar memory_retrieval: The step is a memory retrieval step that retrieves context from vector DBs.
"""
inference = "inference" inference = "inference"
tool_execution = "tool_execution" tool_execution = "tool_execution"
shield_call = "shield_call" shield_call = "shield_call"
@ -66,6 +94,11 @@ class StepType(Enum):
@json_schema_type @json_schema_type
class InferenceStep(StepCommon): class InferenceStep(StepCommon):
"""An inference step in an agent turn.
:param model_response: The response from the LLM.
"""
model_config = ConfigDict(protected_namespaces=()) model_config = ConfigDict(protected_namespaces=())
step_type: Literal[StepType.inference.value] = StepType.inference.value step_type: Literal[StepType.inference.value] = StepType.inference.value
@ -74,6 +107,12 @@ class InferenceStep(StepCommon):
@json_schema_type @json_schema_type
class ToolExecutionStep(StepCommon): class ToolExecutionStep(StepCommon):
"""A tool execution step in an agent turn.
:param tool_calls: The tool calls to execute.
:param tool_responses: The tool responses from the tool calls.
"""
step_type: Literal[StepType.tool_execution.value] = StepType.tool_execution.value step_type: Literal[StepType.tool_execution.value] = StepType.tool_execution.value
tool_calls: List[ToolCall] tool_calls: List[ToolCall]
tool_responses: List[ToolResponse] tool_responses: List[ToolResponse]
@ -81,13 +120,25 @@ class ToolExecutionStep(StepCommon):
@json_schema_type @json_schema_type
class ShieldCallStep(StepCommon): class ShieldCallStep(StepCommon):
"""A shield call step in an agent turn.
:param violation: The violation from the shield call.
"""
step_type: Literal[StepType.shield_call.value] = StepType.shield_call.value step_type: Literal[StepType.shield_call.value] = StepType.shield_call.value
violation: Optional[SafetyViolation] violation: Optional[SafetyViolation]
@json_schema_type @json_schema_type
class MemoryRetrievalStep(StepCommon): class MemoryRetrievalStep(StepCommon):
"""A memory retrieval step in an agent turn.
:param vector_db_ids: The IDs of the vector databases to retrieve context from.
:param inserted_context: The context retrieved from the vector databases.
"""
step_type: Literal[StepType.memory_retrieval.value] = StepType.memory_retrieval.value step_type: Literal[StepType.memory_retrieval.value] = StepType.memory_retrieval.value
# TODO: should this be List[str]?
vector_db_ids: str vector_db_ids: str
inserted_context: InterleavedContent inserted_context: InterleavedContent
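The step models documented above share the `StepCommon` fields and carry a `step_type` discriminator with a fixed default. A small sketch constructing one of them (the IDs are placeholders, and the import path is assumed from this module's location):

```python
from llama_stack.apis.agents import ShieldCallStep  # import path assumed

step = ShieldCallStep(
    turn_id="turn-123",   # placeholder turn ID
    step_id="step-1",     # placeholder step ID
    violation=None,       # no safety violation raised for this step
)
print(step.step_type)  # "shield_call" (fixed by the Literal default)
```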
@ -148,7 +199,7 @@ AgentToolGroup = register_schema(
class AgentConfigCommon(BaseModel): class AgentConfigCommon(BaseModel):
sampling_params: Optional[SamplingParams] = SamplingParams() sampling_params: Optional[SamplingParams] = Field(default_factory=SamplingParams)
input_shields: Optional[List[str]] = Field(default_factory=list) input_shields: Optional[List[str]] = Field(default_factory=list)
output_shields: Optional[List[str]] = Field(default_factory=list) output_shields: Optional[List[str]] = Field(default_factory=list)
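Note the switch from `SamplingParams()` to `Field(default_factory=SamplingParams)` here (and in the batch-inference and inference request models later in this diff). A standalone illustration of the pattern, using throwaway model names rather than anything from the codebase:

```python
from pydantic import BaseModel, Field

class SamplingParams(BaseModel):  # stand-in for the real model
    temperature: float = 0.7

class Request(BaseModel):
    # default_factory builds a fresh SamplingParams lazily for each Request
    # instance, instead of constructing one shared default when the class
    # body is evaluated at import time.
    sampling_params: SamplingParams = Field(default_factory=SamplingParams)

r = Request()
print(r.sampling_params.temperature)  # 0.7
```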
@ -296,16 +347,13 @@ class AgentTurnCreateRequest(AgentConfigOverridablePerTurn):
stream: Optional[bool] = False stream: Optional[bool] = False
tool_config: Optional[ToolConfig] = None tool_config: Optional[ToolConfig] = None
# TODO (xiyan): temporary flag, will remove for 0.1.5
allow_turn_resume: Optional[bool] = False
@json_schema_type @json_schema_type
class AgentTurnResumeRequest(BaseModel): class AgentTurnResumeRequest(BaseModel):
agent_id: str agent_id: str
session_id: str session_id: str
turn_id: str turn_id: str
tool_responses: List[ToolResponseMessage] tool_responses: Union[List[ToolResponse], List[ToolResponseMessage]]
stream: Optional[bool] = False stream: Optional[bool] = False
@ -338,7 +386,13 @@ class Agents(Protocol):
async def create_agent( async def create_agent(
self, self,
agent_config: AgentConfig, agent_config: AgentConfig,
) -> AgentCreateResponse: ... ) -> AgentCreateResponse:
"""Create an agent with the given configuration.
:param agent_config: The configuration for the agent.
:returns: An AgentCreateResponse with the agent ID.
"""
...
@webmethod(route="/agents/{agent_id}/session/{session_id}/turn", method="POST") @webmethod(route="/agents/{agent_id}/session/{session_id}/turn", method="POST")
async def create_agent_turn( async def create_agent_turn(
@ -355,8 +409,19 @@ class Agents(Protocol):
documents: Optional[List[Document]] = None, documents: Optional[List[Document]] = None,
toolgroups: Optional[List[AgentToolGroup]] = None, toolgroups: Optional[List[AgentToolGroup]] = None,
tool_config: Optional[ToolConfig] = None, tool_config: Optional[ToolConfig] = None,
allow_turn_resume: Optional[bool] = False, ) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]:
) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]: ... """Create a new turn for an agent.
:param agent_id: The ID of the agent to create the turn for.
:param session_id: The ID of the session to create the turn for.
:param messages: List of messages to start the turn with.
:param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
:param documents: (Optional) List of documents to create the turn with.
:param toolgroups: (Optional) List of toolgroups to create the turn with, which will be used in addition to the agent's config toolgroups for the request.
:param tool_config: (Optional) The tool configuration to create the turn with, which will be used to override the agent's tool_config.
:returns: If stream=False, returns a Turn object.
If stream=True, returns an SSE event stream of AgentTurnResponseStreamChunk
"""
@webmethod( @webmethod(
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume", route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/resume",
@ -367,7 +432,7 @@ class Agents(Protocol):
agent_id: str, agent_id: str,
session_id: str, session_id: str,
turn_id: str, turn_id: str,
tool_responses: List[ToolResponseMessage], tool_responses: Union[List[ToolResponse], List[ToolResponseMessage]],
stream: Optional[bool] = False, stream: Optional[bool] = False,
) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]: ) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]:
"""Resume an agent turn with executed tool call responses. """Resume an agent turn with executed tool call responses.
@ -378,6 +443,7 @@ class Agents(Protocol):
:param session_id: The ID of the session to resume. :param session_id: The ID of the session to resume.
:param turn_id: The ID of the turn to resume. :param turn_id: The ID of the turn to resume.
:param tool_responses: The tool call responses to resume the turn with. :param tool_responses: The tool call responses to resume the turn with.
NOTE: ToolResponseMessage will be deprecated. Use ToolResponse.
:param stream: Whether to stream the response. :param stream: Whether to stream the response.
:returns: A Turn object if stream is False, otherwise an AsyncIterator of AgentTurnResponseStreamChunk objects. :returns: A Turn object if stream is False, otherwise an AsyncIterator of AgentTurnResponseStreamChunk objects.
""" """
@ -392,7 +458,15 @@ class Agents(Protocol):
agent_id: str, agent_id: str,
session_id: str, session_id: str,
turn_id: str, turn_id: str,
) -> Turn: ... ) -> Turn:
"""Retrieve an agent turn by its ID.
:param agent_id: The ID of the agent to get the turn for.
:param session_id: The ID of the session to get the turn for.
:param turn_id: The ID of the turn to get.
:returns: A Turn.
"""
...
@webmethod( @webmethod(
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}", route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}",
@ -404,14 +478,30 @@ class Agents(Protocol):
session_id: str, session_id: str,
turn_id: str, turn_id: str,
step_id: str, step_id: str,
) -> AgentStepResponse: ... ) -> AgentStepResponse:
"""Retrieve an agent step by its ID.
:param agent_id: The ID of the agent to get the step for.
:param session_id: The ID of the session to get the step for.
:param turn_id: The ID of the turn to get the step for.
:param step_id: The ID of the step to get.
:returns: An AgentStepResponse.
"""
...
@webmethod(route="/agents/{agent_id}/session", method="POST") @webmethod(route="/agents/{agent_id}/session", method="POST")
async def create_agent_session( async def create_agent_session(
self, self,
agent_id: str, agent_id: str,
session_name: str, session_name: str,
) -> AgentSessionCreateResponse: ... ) -> AgentSessionCreateResponse:
"""Create a new session for an agent.
:param agent_id: The ID of the agent to create the session for.
:param session_name: The name of the session to create.
:returns: An AgentSessionCreateResponse.
"""
...
@webmethod(route="/agents/{agent_id}/session/{session_id}", method="GET") @webmethod(route="/agents/{agent_id}/session/{session_id}", method="GET")
async def get_agents_session( async def get_agents_session(
@ -419,17 +509,35 @@ class Agents(Protocol):
session_id: str, session_id: str,
agent_id: str, agent_id: str,
turn_ids: Optional[List[str]] = None, turn_ids: Optional[List[str]] = None,
) -> Session: ... ) -> Session:
"""Retrieve an agent session by its ID.
:param session_id: The ID of the session to get.
:param agent_id: The ID of the agent to get the session for.
:param turn_ids: (Optional) List of turn IDs to filter the session by.
"""
...
@webmethod(route="/agents/{agent_id}/session/{session_id}", method="DELETE") @webmethod(route="/agents/{agent_id}/session/{session_id}", method="DELETE")
async def delete_agents_session( async def delete_agents_session(
self, self,
session_id: str, session_id: str,
agent_id: str, agent_id: str,
) -> None: ... ) -> None:
"""Delete an agent session by its ID.
:param session_id: The ID of the session to delete.
:param agent_id: The ID of the agent to delete the session for.
"""
...
@webmethod(route="/agents/{agent_id}", method="DELETE") @webmethod(route="/agents/{agent_id}", method="DELETE")
async def delete_agent( async def delete_agent(
self, self,
agent_id: str, agent_id: str,
) -> None: ... ) -> None:
"""Delete an agent by its ID.
:param agent_id: The ID of the agent to delete.
"""
...
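Taken together, the docstrings added to the `Agents` protocol describe a create-agent → create-session → create-turn flow. A rough sketch of that flow driven directly against the protocol; the `agents` object, model id, and import paths are assumptions rather than something this diff pins down:

```python
from llama_stack.apis.agents import AgentConfig, Agents  # import paths assumed
from llama_stack.apis.inference import UserMessage

async def one_turn(agents: Agents, question: str):
    created = await agents.create_agent(
        agent_config=AgentConfig(
            model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model id
            instructions="You are a helpful assistant.",
            enable_session_persistence=False,
        )
    )
    session = await agents.create_agent_session(
        agent_id=created.agent_id,
        session_name="docs-example",
    )
    # With stream=False this returns a single Turn object, per the docstring above.
    return await agents.create_agent_turn(
        agent_id=created.agent_id,
        session_id=session.session_id,
        messages=[UserMessage(role="user", content=question)],
        stream=False,
    )
```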

View file

@ -40,7 +40,7 @@ class BatchInference(Protocol):
self, self,
model: str, model: str,
content_batch: List[InterleavedContent], content_batch: List[InterleavedContent],
sampling_params: Optional[SamplingParams] = SamplingParams(), sampling_params: Optional[SamplingParams] = None,
response_format: Optional[ResponseFormat] = None, response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None, logprobs: Optional[LogProbConfig] = None,
) -> BatchCompletionResponse: ... ) -> BatchCompletionResponse: ...
@ -50,7 +50,7 @@ class BatchInference(Protocol):
self, self,
model: str, model: str,
messages_batch: List[List[Message]], messages_batch: List[List[Message]],
sampling_params: Optional[SamplingParams] = SamplingParams(), sampling_params: Optional[SamplingParams] = None,
# zero-shot tool definitions as input to the model # zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = list, tools: Optional[List[ToolDefinition]] = list,
tool_choice: Optional[ToolChoice] = ToolChoice.auto, tool_choice: Optional[ToolChoice] = ToolChoice.auto,

View file

@ -14,6 +14,14 @@ from llama_stack.schema_utils import json_schema_type, webmethod
@json_schema_type @json_schema_type
class PaginatedRowsResult(BaseModel): class PaginatedRowsResult(BaseModel):
"""
A paginated list of rows from a dataset.
:param rows: The rows in the current page.
:param total_count: The total number of rows in the dataset.
:param next_page_token: The token to get the next page of rows.
"""
# the rows obey the DatasetSchema for the given dataset # the rows obey the DatasetSchema for the given dataset
rows: List[Dict[str, Any]] rows: List[Dict[str, Any]]
total_count: int total_count: int
@ -36,7 +44,15 @@ class DatasetIO(Protocol):
rows_in_page: int, rows_in_page: int,
page_token: Optional[str] = None, page_token: Optional[str] = None,
filter_condition: Optional[str] = None, filter_condition: Optional[str] = None,
) -> PaginatedRowsResult: ... ) -> PaginatedRowsResult:
"""Get a paginated list of rows from a dataset.
:param dataset_id: The ID of the dataset to get the rows from.
:param rows_in_page: The number of rows to get per page.
:param page_token: The token to get the next page of rows.
:param filter_condition: (Optional) A condition to filter the rows by.
"""
...
@webmethod(route="/datasetio/rows", method="POST") @webmethod(route="/datasetio/rows", method="POST")
async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ... async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...
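The `PaginatedRowsResult` fields documented above are enough to walk a dataset page by page. A sketch of that loop; `datasetio` is assumed to be any object implementing the `DatasetIO` protocol, and the import path is inferred from this module's location:

```python
from typing import Any, Dict, List

from llama_stack.apis.datasetio import DatasetIO  # import path assumed

async def fetch_all_rows(datasetio: DatasetIO, dataset_id: str) -> List[Dict[str, Any]]:
    rows: List[Dict[str, Any]] = []
    page_token = None
    while True:
        page = await datasetio.get_rows_paginated(
            dataset_id=dataset_id,
            rows_in_page=100,       # page size chosen arbitrarily for the example
            page_token=page_token,
        )
        rows.extend(page.rows)
        page_token = page.next_page_token
        if not page_token:          # no further pages
            break
    return rows
```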

View file

@ -5,6 +5,9 @@
# the root directory of this source tree. # the root directory of this source tree.
from enum import Enum from enum import Enum
from typing import Optional
from pydantic import BaseModel
from llama_stack.schema_utils import json_schema_type from llama_stack.schema_utils import json_schema_type
@ -33,3 +36,20 @@ class Api(Enum):
# built-in API # built-in API
inspect = "inspect" inspect = "inspect"
@json_schema_type
class Error(BaseModel):
"""
Error response from the API. Roughly follows RFC 7807.
:param status: HTTP status code
:param title: Error title, a short summary of the error which is invariant for an error type
:param detail: Error detail, a longer human-readable description of the error
:param instance: (Optional) A URL which can be used to retrieve more information about the specific occurrence of the error
"""
status: int
title: str
detail: str
instance: Optional[str] = None
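Since the `Error` model mirrors RFC 7807's problem-details shape, constructing one is straightforward. The values below are made up for illustration, and the import path is assumed from the file being modified:

```python
from llama_stack.apis.datatypes import Error  # import path assumed

not_found = Error(
    status=404,
    title="Model not found",
    detail="No model with identifier 'meta-llama/Llama-3.2-3B-Instruct' is registered.",
    instance="/v1/models/meta-llama/Llama-3.2-3B-Instruct",  # optional pointer to this occurrence
)
print(not_found.model_dump_json())
```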

View file

@ -19,6 +19,13 @@ from llama_stack.schema_utils import json_schema_type, register_schema, webmetho
@json_schema_type @json_schema_type
class ModelCandidate(BaseModel): class ModelCandidate(BaseModel):
"""A model candidate for evaluation.
:param model: The model ID to evaluate.
:param sampling_params: The sampling parameters for the model.
:param system_message: (Optional) The system message providing instructions or context to the model.
"""
type: Literal["model"] = "model" type: Literal["model"] = "model"
model: str model: str
sampling_params: SamplingParams sampling_params: SamplingParams
@ -27,6 +34,11 @@ class ModelCandidate(BaseModel):
@json_schema_type @json_schema_type
class AgentCandidate(BaseModel): class AgentCandidate(BaseModel):
"""An agent candidate for evaluation.
:param config: The configuration for the agent candidate.
"""
type: Literal["agent"] = "agent" type: Literal["agent"] = "agent"
config: AgentConfig config: AgentConfig
@ -39,6 +51,13 @@ EvalCandidate = register_schema(
@json_schema_type @json_schema_type
class BenchmarkConfig(BaseModel): class BenchmarkConfig(BaseModel):
"""A benchmark configuration for evaluation.
:param eval_candidate: The candidate to evaluate.
:param scoring_params: Map between scoring function id and parameters for each scoring function you want to run
:param num_examples: (Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated
"""
eval_candidate: EvalCandidate eval_candidate: EvalCandidate
scoring_params: Dict[str, ScoringFnParams] = Field( scoring_params: Dict[str, ScoringFnParams] = Field(
description="Map between scoring function id and parameters for each scoring function you want to run", description="Map between scoring function id and parameters for each scoring function you want to run",
@ -53,18 +72,32 @@ class BenchmarkConfig(BaseModel):
@json_schema_type @json_schema_type
class EvaluateResponse(BaseModel): class EvaluateResponse(BaseModel):
"""The response from an evaluation.
:param generations: The generations from the evaluation.
:param scores: The scores from the evaluation.
"""
generations: List[Dict[str, Any]] generations: List[Dict[str, Any]]
# each key in the dict is a scoring function name # each key in the dict is a scoring function name
scores: Dict[str, ScoringResult] scores: Dict[str, ScoringResult]
class Eval(Protocol): class Eval(Protocol):
"""Llama Stack Evaluation API for running evaluations on model and agent candidates."""
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST") @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST")
async def run_eval( async def run_eval(
self, self,
benchmark_id: str, benchmark_id: str,
task_config: BenchmarkConfig, benchmark_config: BenchmarkConfig,
) -> Job: ... ) -> Job:
"""Run an evaluation on a benchmark.
:param benchmark_id: The ID of the benchmark to run the evaluation on.
:param benchmark_config: The configuration for the benchmark.
:return: The job that was created to run the evaluation.
"""
@webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST") @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST")
async def evaluate_rows( async def evaluate_rows(
@ -72,14 +105,41 @@ class Eval(Protocol):
benchmark_id: str, benchmark_id: str,
input_rows: List[Dict[str, Any]], input_rows: List[Dict[str, Any]],
scoring_functions: List[str], scoring_functions: List[str],
task_config: BenchmarkConfig, benchmark_config: BenchmarkConfig,
) -> EvaluateResponse: ... ) -> EvaluateResponse:
"""Evaluate a list of rows on a benchmark.
:param benchmark_id: The ID of the benchmark to run the evaluation on.
:param input_rows: The rows to evaluate.
:param scoring_functions: The scoring functions to use for the evaluation.
:param benchmark_config: The configuration for the benchmark.
:return: EvaluateResponse object containing generations and scores
"""
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET") @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET")
async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]: ... async def job_status(self, benchmark_id: str, job_id: str) -> Optional[JobStatus]:
"""Get the status of a job.
:param benchmark_id: The ID of the benchmark to run the evaluation on.
:param job_id: The ID of the job to get the status of.
:return: The status of the evaluation job.
"""
...
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE") @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE")
async def job_cancel(self, benchmark_id: str, job_id: str) -> None: ... async def job_cancel(self, benchmark_id: str, job_id: str) -> None:
"""Cancel a job.
:param benchmark_id: The ID of the benchmark to run the evaluation on.
:param job_id: The ID of the job to cancel.
"""
...
@webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET")
async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: ... async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse:
"""Get the result of a job.
:param benchmark_id: The ID of the benchmark to run the evaluation on.
:param job_id: The ID of the job to get the result of.
:return: The result of the job.
"""

View file

@ -278,7 +278,7 @@ ResponseFormat = register_schema(
class CompletionRequest(BaseModel): class CompletionRequest(BaseModel):
model: str model: str
content: InterleavedContent content: InterleavedContent
sampling_params: Optional[SamplingParams] = SamplingParams() sampling_params: Optional[SamplingParams] = Field(default_factory=SamplingParams)
response_format: Optional[ResponseFormat] = None response_format: Optional[ResponseFormat] = None
stream: Optional[bool] = False stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None logprobs: Optional[LogProbConfig] = None
@ -357,7 +357,7 @@ class ToolConfig(BaseModel):
class ChatCompletionRequest(BaseModel): class ChatCompletionRequest(BaseModel):
model: str model: str
messages: List[Message] messages: List[Message]
sampling_params: Optional[SamplingParams] = SamplingParams() sampling_params: Optional[SamplingParams] = Field(default_factory=SamplingParams)
tools: Optional[List[ToolDefinition]] = Field(default_factory=list) tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_config: Optional[ToolConfig] = Field(default_factory=ToolConfig) tool_config: Optional[ToolConfig] = Field(default_factory=ToolConfig)
@ -444,7 +444,7 @@ class Inference(Protocol):
self, self,
model_id: str, model_id: str,
content: InterleavedContent, content: InterleavedContent,
sampling_params: Optional[SamplingParams] = SamplingParams(), sampling_params: Optional[SamplingParams] = None,
response_format: Optional[ResponseFormat] = None, response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False, stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None, logprobs: Optional[LogProbConfig] = None,
@ -467,7 +467,7 @@ class Inference(Protocol):
self, self,
model_id: str, model_id: str,
messages: List[Message], messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(), sampling_params: Optional[SamplingParams] = None,
tools: Optional[List[ToolDefinition]] = None, tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = ToolChoice.auto, tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = None, tool_prompt_format: Optional[ToolPromptFormat] = None,

View file

@ -17,6 +17,13 @@ ScoringResultRow = Dict[str, Any]
@json_schema_type @json_schema_type
class ScoringResult(BaseModel): class ScoringResult(BaseModel):
"""
A scoring result for a single scoring function.
:param score_rows: The scoring result for each row. Each row is a map of column name to value.
:param aggregated_results: Map of metric name to aggregated value
"""
score_rows: List[ScoringResultRow] score_rows: List[ScoringResultRow]
# aggregated metrics to value # aggregated metrics to value
aggregated_results: Dict[str, Any] aggregated_results: Dict[str, Any]
@ -30,6 +37,12 @@ class ScoreBatchResponse(BaseModel):
@json_schema_type @json_schema_type
class ScoreResponse(BaseModel): class ScoreResponse(BaseModel):
"""
The response from scoring.
:param results: A map of scoring function name to ScoringResult.
"""
# each key in the dict is a scoring function name # each key in the dict is a scoring function name
results: Dict[str, ScoringResult] results: Dict[str, ScoringResult]
@ -55,4 +68,11 @@ class Scoring(Protocol):
self, self,
input_rows: List[Dict[str, Any]], input_rows: List[Dict[str, Any]],
scoring_functions: Dict[str, Optional[ScoringFnParams]], scoring_functions: Dict[str, Optional[ScoringFnParams]],
) -> ScoreResponse: ... ) -> ScoreResponse:
"""Score a list of rows.
:param input_rows: The rows to score.
:param scoring_functions: The scoring functions to use for the scoring.
:return: ScoreResponse object containing rows and aggregated results
"""
...
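A sketch of calling the documented `score` method; `scoring` is assumed to implement the `Scoring` protocol, and both the row shape and the scoring-function id are illustrative placeholders:

```python
from llama_stack.apis.scoring import Scoring  # import path assumed

async def score_one_row(scoring: Scoring) -> None:
    response = await scoring.score(
        input_rows=[
            {"input_query": "What is 2 + 2?", "generated_answer": "4", "expected_answer": "4"},
        ],
        scoring_functions={"basic::equality": None},  # scoring fn id -> optional params
    )
    for fn_name, result in response.results.items():
        print(fn_name, result.aggregated_results)
```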

View file

@ -9,6 +9,7 @@ import argparse
from .download import Download from .download import Download
from .model import ModelParser from .model import ModelParser
from .stack import StackParser from .stack import StackParser
from .stack.utils import print_subcommand_description
from .verify_download import VerifyDownload from .verify_download import VerifyDownload
@ -20,6 +21,7 @@ class LlamaCLIParser:
prog="llama", prog="llama",
description="Welcome to the Llama CLI", description="Welcome to the Llama CLI",
add_help=True, add_help=True,
formatter_class=argparse.RawTextHelpFormatter,
) )
# Default command is to print help # Default command is to print help
@ -33,6 +35,8 @@ class LlamaCLIParser:
Download.create(subparsers) Download.create(subparsers)
VerifyDownload.create(subparsers) VerifyDownload.create(subparsers)
print_subcommand_description(self.parser, subparsers)
def parse_args(self) -> argparse.Namespace: def parse_args(self) -> argparse.Namespace:
return self.parser.parse_args() return self.parser.parse_args()

View file

@ -64,7 +64,7 @@ class ModelDescribe(Subcommand):
] ]
if model.recommended_sampling_params is not None: if model.recommended_sampling_params is not None:
sampling_params = model.recommended_sampling_params.dict() sampling_params = model.recommended_sampling_params.model_dump()
for k in ("max_tokens", "repetition_penalty"): for k in ("max_tokens", "repetition_penalty"):
del sampling_params[k] del sampling_params[k]
rows.append( rows.append(
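The `.dict()` → `.model_dump()` switch above follows the Pydantic v2 naming, where `.dict()` is deprecated. A throwaway illustration of the same pattern (the model here is a stand-in, not the real recommended-sampling-params type):

```python
from pydantic import BaseModel

class RecommendedParams(BaseModel):  # illustrative stand-in
    temperature: float = 0.6
    max_tokens: int = 512
    repetition_penalty: float = 1.0

params = RecommendedParams().model_dump()
for k in ("max_tokens", "repetition_penalty"):  # mirrors the deletions in ModelDescribe
    del params[k]
print(params)  # {'temperature': 0.6}
```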

View file

@ -12,6 +12,7 @@ from llama_stack.cli.model.list import ModelList
from llama_stack.cli.model.prompt_format import ModelPromptFormat from llama_stack.cli.model.prompt_format import ModelPromptFormat
from llama_stack.cli.model.remove import ModelRemove from llama_stack.cli.model.remove import ModelRemove
from llama_stack.cli.model.verify_download import ModelVerifyDownload from llama_stack.cli.model.verify_download import ModelVerifyDownload
from llama_stack.cli.stack.utils import print_subcommand_description
from llama_stack.cli.subcommand import Subcommand from llama_stack.cli.subcommand import Subcommand
@ -24,6 +25,7 @@ class ModelParser(Subcommand):
"model", "model",
prog="llama model", prog="llama model",
description="Work with llama models", description="Work with llama models",
formatter_class=argparse.RawTextHelpFormatter,
) )
self.parser.set_defaults(func=lambda args: self.parser.print_help()) self.parser.set_defaults(func=lambda args: self.parser.print_help())
@ -37,3 +39,5 @@ class ModelParser(Subcommand):
ModelDescribe.create(subparsers) ModelDescribe.create(subparsers)
ModelVerifyDownload.create(subparsers) ModelVerifyDownload.create(subparsers)
ModelRemove.create(subparsers) ModelRemove.create(subparsers)
print_subcommand_description(self.parser, subparsers)

View file

@ -7,10 +7,14 @@
import argparse import argparse
import textwrap import textwrap
from io import StringIO from io import StringIO
from pathlib import Path
from llama_stack.cli.subcommand import Subcommand from llama_stack.cli.subcommand import Subcommand
from llama_stack.cli.table import print_table
from llama_stack.models.llama.datatypes import CoreModelId, ModelFamily, is_multimodal, model_family from llama_stack.models.llama.datatypes import CoreModelId, ModelFamily, is_multimodal, model_family
ROOT_DIR = Path(__file__).parent.parent.parent
class ModelPromptFormat(Subcommand): class ModelPromptFormat(Subcommand):
"""Llama model cli for describe a model prompt format (message formats)""" """Llama model cli for describe a model prompt format (message formats)"""
@ -40,6 +44,12 @@ class ModelPromptFormat(Subcommand):
default="llama3_1", default="llama3_1",
help="Model Family (llama3_1, llama3_X, etc.)", help="Model Family (llama3_1, llama3_X, etc.)",
) )
self.parser.add_argument(
"-l",
"--list",
action="store_true",
help="List all available models",
)
def _run_model_template_cmd(self, args: argparse.Namespace) -> None: def _run_model_template_cmd(self, args: argparse.Namespace) -> None:
import importlib.resources import importlib.resources
@ -48,7 +58,26 @@ class ModelPromptFormat(Subcommand):
supported_model_ids = [ supported_model_ids = [
m for m in CoreModelId if model_family(m) in {ModelFamily.llama3_1, ModelFamily.llama3_2} m for m in CoreModelId if model_family(m) in {ModelFamily.llama3_1, ModelFamily.llama3_2}
] ]
model_str = "\n".join([m.value for m in supported_model_ids])
model_list = [m.value for m in supported_model_ids]
model_str = "\n".join(model_list)
if args.list:
headers = ["Model(s)"]
rows = []
for m in model_list:
rows.append(
[
m,
]
)
print_table(
rows,
headers,
separate_rows=True,
)
return
try: try:
model_id = CoreModelId(args.model_name) model_id = CoreModelId(args.model_name)
except ValueError: except ValueError:
@ -57,9 +86,9 @@ class ModelPromptFormat(Subcommand):
if model_id not in supported_model_ids: if model_id not in supported_model_ids:
self.parser.error(f"{model_id} is not a valid Model. Choose one from --\n {model_str}") self.parser.error(f"{model_id} is not a valid Model. Choose one from --\n {model_str}")
llama_3_1_file = importlib.resources.files("llama_models") / "llama3_1/prompt_format.md" llama_3_1_file = ROOT_DIR / "models" / "llama" / "llama3_1" / "prompt_format.md"
llama_3_2_text_file = importlib.resources.files("llama_models") / "llama3_2/text_prompt_format.md" llama_3_2_text_file = ROOT_DIR / "models" / "llama" / "llama3_2" / "text_prompt_format.md"
llama_3_2_vision_file = importlib.resources.files("llama_models") / "llama3_2/vision_prompt_format.md" llama_3_2_vision_file = ROOT_DIR / "models" / "llama" / "llama3_2" / "vision_prompt_format.md"
if model_family(model_id) == ModelFamily.llama3_1: if model_family(model_id) == ModelFamily.llama3_1:
with importlib.resources.as_file(llama_3_1_file) as f: with importlib.resources.as_file(llama_3_1_file) as f:
content = f.open("r").read() content = f.open("r").read()

View file

@ -38,7 +38,7 @@ from llama_stack.distribution.distribution import get_provider_registry
from llama_stack.distribution.resolver import InvalidProviderError from llama_stack.distribution.resolver import InvalidProviderError
from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
from llama_stack.distribution.utils.dynamic import instantiate_class_type from llama_stack.distribution.utils.dynamic import instantiate_class_type
from llama_stack.distribution.utils.exec import formulate_run_args, in_notebook, run_with_pty from llama_stack.distribution.utils.exec import formulate_run_args, run_with_pty
from llama_stack.distribution.utils.image_types import ImageType from llama_stack.distribution.utils.image_types import ImageType
from llama_stack.providers.datatypes import Api from llama_stack.providers.datatypes import Api
@ -65,8 +65,6 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
if args.image_type == "venv": if args.image_type == "venv":
current_venv = os.environ.get("VIRTUAL_ENV") current_venv = os.environ.get("VIRTUAL_ENV")
image_name = args.image_name or current_venv image_name = args.image_name or current_venv
if not image_name and in_notebook():
image_name = "__system__"
elif args.image_type == "conda": elif args.image_type == "conda":
current_conda_env = os.environ.get("CONDA_DEFAULT_ENV") current_conda_env = os.environ.get("CONDA_DEFAULT_ENV")
image_name = args.image_name or current_conda_env image_name = args.image_name or current_conda_env
@ -143,7 +141,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
completer=WordCompleter(available_providers), completer=WordCompleter(available_providers),
complete_while_typing=True, complete_while_typing=True,
validator=Validator.from_callable( validator=Validator.from_callable(
lambda x: x in available_providers, lambda x: x in available_providers, # noqa: B023 - see https://github.com/astral-sh/ruff/issues/7847
error_message="Invalid provider, use <TAB> to see options", error_message="Invalid provider, use <TAB> to see options",
), ),
) )
@ -250,7 +248,7 @@ def _generate_run_config(
config_type = instantiate_class_type(provider_registry[Api(api)][provider_type].config_class) config_type = instantiate_class_type(provider_registry[Api(api)][provider_type].config_class)
if hasattr(config_type, "sample_run_config"): if hasattr(config_type, "sample_run_config"):
config = config_type.sample_run_config(__distro_dir__=f"distributions/{image_name}") config = config_type.sample_run_config(__distro_dir__=f"~/.llama/distributions/{image_name}")
else: else:
config = {} config = {}
@ -291,6 +289,8 @@ def _run_stack_build_command_from_build_config(
if not image_name: if not image_name:
raise ValueError("Please specify an image name when building a conda image") raise ValueError("Please specify an image name when building a conda image")
elif build_config.image_type == ImageType.venv.value: elif build_config.image_type == ImageType.venv.value:
if not image_name and os.environ.get("UV_SYSTEM_PYTHON"):
image_name = "__system__"
if not image_name: if not image_name:
raise ValueError("Please specify an image name when building a venv image") raise ValueError("Please specify an image name when building a venv image")

View file

@ -16,7 +16,7 @@ class StackBuild(Subcommand):
"build", "build",
prog="llama stack build", prog="llama stack build",
description="Build a Llama stack container", description="Build a Llama stack container",
formatter_class=argparse.RawTextHelpFormatter, formatter_class=argparse.ArgumentDefaultsHelpFormatter,
) )
self._add_arguments() self._add_arguments()
self.parser.set_defaults(func=self._run_stack_build_command) self.parser.set_defaults(func=self._run_stack_build_command)
@ -26,7 +26,7 @@ class StackBuild(Subcommand):
"--config", "--config",
type=str, type=str,
default=None, default=None,
help="Path to a config file to use for the build. You can find example configs in llama_stack/distribution/**/build.yaml. If this argument is not provided, you will be prompted to enter information interactively", help="Path to a config file to use for the build. You can find example configs in llama_stack/distributions/**/build.yaml. If this argument is not provided, you will be prompted to enter information interactively",
) )
self.parser.add_argument( self.parser.add_argument(

View file

@ -5,15 +5,15 @@
# the root directory of this source tree. # the root directory of this source tree.
import argparse import argparse
import logging
import os import os
from pathlib import Path from pathlib import Path
from llama_stack.cli.subcommand import Subcommand from llama_stack.cli.subcommand import Subcommand
from llama_stack.log import get_logger
REPO_ROOT = Path(__file__).parent.parent.parent.parent REPO_ROOT = Path(__file__).parent.parent.parent.parent
logger = logging.getLogger(__name__) logger = get_logger(name=__name__, category="server")
class StackRun(Subcommand): class StackRun(Subcommand):
@ -23,7 +23,7 @@ class StackRun(Subcommand):
"run", "run",
prog="llama stack run", prog="llama stack run",
description="""Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.""", description="""Start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.""",
formatter_class=argparse.RawTextHelpFormatter, formatter_class=argparse.ArgumentDefaultsHelpFormatter,
) )
self._add_arguments() self._add_arguments()
self.parser.set_defaults(func=self._run_stack_run_cmd) self.parser.set_defaults(func=self._run_stack_run_cmd)
@ -37,12 +37,13 @@ class StackRun(Subcommand):
self.parser.add_argument( self.parser.add_argument(
"--port", "--port",
type=int, type=int,
help="Port to run the server on. Defaults to 8321", help="Port to run the server on. It can also be passed via the env var LLAMA_STACK_PORT.",
default=int(os.getenv("LLAMA_STACK_PORT", 8321)), default=int(os.getenv("LLAMA_STACK_PORT", 8321)),
) )
self.parser.add_argument( self.parser.add_argument(
"--image-name", "--image-name",
type=str, type=str,
default=os.environ.get("CONDA_DEFAULT_ENV"),
help="Name of the image to run. Defaults to the current conda environment", help="Name of the image to run. Defaults to the current conda environment",
) )
self.parser.add_argument( self.parser.add_argument(
@ -73,17 +74,14 @@ class StackRun(Subcommand):
type=str, type=str,
help="Image Type used during the build. This can be either conda or container or venv.", help="Image Type used during the build. This can be either conda or container or venv.",
choices=["conda", "container", "venv"], choices=["conda", "container", "venv"],
default="conda",
) )
def _run_stack_run_cmd(self, args: argparse.Namespace) -> None: def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
import yaml import yaml
from llama_stack.distribution.build import ImageType
from llama_stack.distribution.configure import parse_and_maybe_upgrade_config from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
from llama_stack.distribution.utils.config_dirs import ( from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
BUILDS_BASE_DIR,
DISTRIBS_BASE_DIR,
)
from llama_stack.distribution.utils.exec import formulate_run_args, run_with_pty from llama_stack.distribution.utils.exec import formulate_run_args, run_with_pty
config_file = Path(args.config) config_file = Path(args.config)
@ -96,14 +94,6 @@ class StackRun(Subcommand):
if config_file.exists(): if config_file.exists():
template_name = args.config template_name = args.config
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to conda dir
config_file = Path(BUILDS_BASE_DIR / ImageType.conda.value / f"{args.config}-run.yaml")
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to container dir
config_file = Path(BUILDS_BASE_DIR / ImageType.container.value / f"{args.config}-run.yaml")
if not config_file.exists() and not has_yaml_suffix: if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to ~/.llama dir # check if it's a build config saved to ~/.llama dir
config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml") config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml")

View file

@ -7,6 +7,7 @@
import argparse import argparse
from importlib.metadata import version from importlib.metadata import version
from llama_stack.cli.stack.utils import print_subcommand_description
from llama_stack.cli.subcommand import Subcommand from llama_stack.cli.subcommand import Subcommand
from .build import StackBuild from .build import StackBuild
@ -22,6 +23,7 @@ class StackParser(Subcommand):
"stack", "stack",
prog="llama stack", prog="llama stack",
description="Operations for the Llama Stack / Distributions", description="Operations for the Llama Stack / Distributions",
formatter_class=argparse.RawTextHelpFormatter,
) )
self.parser.add_argument( self.parser.add_argument(
@ -39,3 +41,5 @@ class StackParser(Subcommand):
StackListApis.create(subparsers) StackListApis.create(subparsers)
StackListProviders.create(subparsers) StackListProviders.create(subparsers)
StackRun.create(subparsers) StackRun.create(subparsers)
print_subcommand_description(self.parser, subparsers)

View file

@ -0,0 +1,14 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
def print_subcommand_description(parser, subparsers):
"""Print descriptions of subcommands."""
description_text = ""
for name, subcommand in subparsers.choices.items():
description = subcommand.description
description_text += f" {name:<21} {description}\n"
parser.epilog = description_text

View file

@ -15,7 +15,6 @@ from termcolor import cprint
from llama_stack.distribution.datatypes import BuildConfig, Provider from llama_stack.distribution.datatypes import BuildConfig, Provider
from llama_stack.distribution.distribution import get_provider_registry from llama_stack.distribution.distribution import get_provider_registry
from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR
from llama_stack.distribution.utils.exec import run_command, run_with_pty from llama_stack.distribution.utils.exec import run_command, run_with_pty
from llama_stack.distribution.utils.image_types import ImageType from llama_stack.distribution.utils.image_types import ImageType
from llama_stack.providers.datatypes import Api from llama_stack.providers.datatypes import Api
@ -103,8 +102,6 @@ def build_image(
template_or_config, template_or_config,
image_name, image_name,
container_base, container_base,
str(build_file_path),
str(BUILDS_BASE_DIR / ImageType.container.value),
" ".join(normal_deps), " ".join(normal_deps),
] ]
elif build_config.image_type == ImageType.conda.value: elif build_config.image_type == ImageType.conda.value:

View file

@ -6,8 +6,8 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-} LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-} TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
# This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out # This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
@ -16,8 +16,8 @@ UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}
if [ -n "$LLAMA_STACK_DIR" ]; then if [ -n "$LLAMA_STACK_DIR" ]; then
echo "Using llama-stack-dir=$LLAMA_STACK_DIR" echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
fi fi
if [ -n "$LLAMA_MODELS_DIR" ]; then if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
echo "Using llama-models-dir=$LLAMA_MODELS_DIR" echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
fi fi
if [ "$#" -lt 3 ]; then if [ "$#" -lt 3 ]; then
@ -52,7 +52,7 @@ ensure_conda_env_python310() {
local python_version="3.10" local python_version="3.10"
# Check if conda command is available # Check if conda command is available
if ! command -v conda &>/dev/null; then if ! is_command_available conda; then
printf "${RED}Error: conda command not found. Is Conda installed and in your PATH?${NC}" >&2 printf "${RED}Error: conda command not found. Is Conda installed and in your PATH?${NC}" >&2
exit 1 exit 1
fi fi
@ -87,8 +87,6 @@ ensure_conda_env_python310() {
# these packages are damaged in test-pypi, so install them first # these packages are damaged in test-pypi, so install them first
uv pip install fastapi libcst uv pip install fastapi libcst
uv pip install --extra-index-url https://test.pypi.org/simple/ \ uv pip install --extra-index-url https://test.pypi.org/simple/ \
llama-models==$TEST_PYPI_VERSION \
llama-stack-client==$TEST_PYPI_VERSION \
llama-stack==$TEST_PYPI_VERSION \ llama-stack==$TEST_PYPI_VERSION \
$pip_dependencies $pip_dependencies
if [ -n "$special_pip_deps" ]; then if [ -n "$special_pip_deps" ]; then
@ -111,22 +109,21 @@ ensure_conda_env_python310() {
else else
PYPI_VERSION="${PYPI_VERSION:-}" PYPI_VERSION="${PYPI_VERSION:-}"
if [ -n "$PYPI_VERSION" ]; then if [ -n "$PYPI_VERSION" ]; then
SPEC_VERSION="llama-stack==${PYPI_VERSION} llama-models==${PYPI_VERSION} llama-stack-client==${PYPI_VERSION}" SPEC_VERSION="llama-stack==${PYPI_VERSION}"
else else
SPEC_VERSION="llama-stack" SPEC_VERSION="llama-stack"
fi fi
uv pip install --no-cache-dir $SPEC_VERSION uv pip install --no-cache-dir $SPEC_VERSION
fi fi
if [ -n "$LLAMA_MODELS_DIR" ]; then if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
if [ ! -d "$LLAMA_MODELS_DIR" ]; then if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
printf "${RED}Warning: LLAMA_MODELS_DIR is set but directory does not exist: $LLAMA_MODELS_DIR${NC}\n" >&2 printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: $LLAMA_STACK_CLIENT_DIR${NC}\n" >&2
exit 1 exit 1
fi fi
printf "Installing from LLAMA_MODELS_DIR: $LLAMA_MODELS_DIR\n" printf "Installing from LLAMA_STACK_CLIENT_DIR: $LLAMA_STACK_CLIENT_DIR\n"
uv pip uninstall llama-models uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
uv pip install --no-cache-dir -e "$LLAMA_MODELS_DIR"
fi fi
# Install pip dependencies # Install pip dependencies

View file

@ -1,4 +1,4 @@
#!/bin/bash #!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates. # Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved. # All rights reserved.
@ -6,7 +6,6 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-} LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-} LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
@ -20,26 +19,27 @@ UV_HTTP_TIMEOUT=${UV_HTTP_TIMEOUT:-500}
# mounting is not supported by docker buildx, so we use COPY instead # mounting is not supported by docker buildx, so we use COPY instead
USE_COPY_NOT_MOUNT=${USE_COPY_NOT_MOUNT:-} USE_COPY_NOT_MOUNT=${USE_COPY_NOT_MOUNT:-}
if [ "$#" -lt 6 ]; then if [ "$#" -lt 4 ]; then
# This only works for templates # This only works for templates
echo "Usage: $0 <template_or_config> <image_name> <container_base> <build_file_path> <host_build_dir> <pip_dependencies> [<special_pip_deps>]" >&2 echo "Usage: $0 <template_or_config> <image_name> <container_base> <pip_dependencies> [<special_pip_deps>]" >&2
exit 1 exit 1
fi fi
set -euo pipefail set -euo pipefail
template_or_config="$1" template_or_config="$1"
image_name="$2" shift
container_base="$3" image_name="$1"
build_file_path="$4" shift
host_build_dir="$5" container_base="$1"
pip_dependencies="$6" shift
special_pip_deps="${7:-}" pip_dependencies="$1"
shift
special_pip_deps="${1:-}"
# Define color codes # Define color codes
RED='\033[0;31m' RED='\033[0;31m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color NC='\033[0m' # No Color
CONTAINER_BINARY=${CONTAINER_BINARY:-docker} CONTAINER_BINARY=${CONTAINER_BINARY:-docker}
@ -47,8 +47,10 @@ CONTAINER_OPTS=${CONTAINER_OPTS:-}
TEMP_DIR=$(mktemp -d) TEMP_DIR=$(mktemp -d)
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
source "$SCRIPT_DIR/common.sh"
add_to_container() { add_to_container() {
local input
output_file="$TEMP_DIR/Containerfile" output_file="$TEMP_DIR/Containerfile"
if [ -t 0 ]; then if [ -t 0 ]; then
printf '%s\n' "$1" >>"$output_file" printf '%s\n' "$1" >>"$output_file"
@ -58,15 +60,21 @@ add_to_container() {
fi fi
} }
# Check if container command is available
if ! is_command_available $CONTAINER_BINARY; then
printf "${RED}Error: ${CONTAINER_BINARY} command not found. Is ${CONTAINER_BINARY} installed and in your PATH?${NC}" >&2
exit 1
fi
# Update and install UBI9 components if UBI9 base image is used # Update and install UBI9 components if UBI9 base image is used
if [[ $container_base == *"registry.access.redhat.com/ubi9"* ]]; then if [[ $container_base == *"registry.access.redhat.com/ubi9"* ]]; then
add_to_container << EOF add_to_container << EOF
FROM $container_base FROM $container_base
WORKDIR /app WORKDIR /app
RUN microdnf -y update && microdnf install -y iputils net-tools wget \ RUN dnf -y update && dnf install -y iputils net-tools wget \
vim-minimal python3.11 python3.11-pip python3.11-wheel \ vim-minimal python3.11 python3.11-pip python3.11-wheel \
python3.11-setuptools && ln -s /bin/pip3.11 /bin/pip && ln -s /bin/python3.11 /bin/python && microdnf clean all python3.11-setuptools && ln -s /bin/pip3.11 /bin/pip && ln -s /bin/python3.11 /bin/python && dnf clean all
ENV UV_SYSTEM_PYTHON=1 ENV UV_SYSTEM_PYTHON=1
RUN pip install uv RUN pip install uv
@ -107,7 +115,6 @@ EOF
fi fi
stack_mount="/app/llama-stack-source" stack_mount="/app/llama-stack-source"
models_mount="/app/llama-models-source"
client_mount="/app/llama-stack-client-source" client_mount="/app/llama-stack-client-source"
install_local_package() { install_local_package() {
@ -131,10 +138,6 @@ EOF
} }
if [ -n "$LLAMA_MODELS_DIR" ]; then
install_local_package "$LLAMA_MODELS_DIR" "$models_mount" "LLAMA_MODELS_DIR"
fi
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
install_local_package "$LLAMA_STACK_CLIENT_DIR" "$client_mount" "LLAMA_STACK_CLIENT_DIR" install_local_package "$LLAMA_STACK_CLIENT_DIR" "$client_mount" "LLAMA_STACK_CLIENT_DIR"
fi fi
@ -150,12 +153,12 @@ EOF
add_to_container << EOF add_to_container << EOF
RUN uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ \ RUN uv pip install --no-cache --extra-index-url https://test.pypi.org/simple/ \
--index-strategy unsafe-best-match \ --index-strategy unsafe-best-match \
llama-models==$TEST_PYPI_VERSION llama-stack-client==$TEST_PYPI_VERSION llama-stack==$TEST_PYPI_VERSION llama-stack==$TEST_PYPI_VERSION
EOF EOF
else else
if [ -n "$PYPI_VERSION" ]; then if [ -n "$PYPI_VERSION" ]; then
SPEC_VERSION="llama-stack==${PYPI_VERSION} llama-models==${PYPI_VERSION} llama-stack-client==${PYPI_VERSION}" SPEC_VERSION="llama-stack==${PYPI_VERSION}"
else else
SPEC_VERSION="llama-stack" SPEC_VERSION="llama-stack"
fi fi
@ -165,6 +168,11 @@ EOF
fi fi
fi fi
# remove uv after installation
add_to_container << EOF
RUN pip uninstall -y uv
EOF
# if template_or_config ends with .yaml, it is not a template and we should not use the --template flag # if template_or_config ends with .yaml, it is not a template and we should not use the --template flag
if [[ "$template_or_config" != *.yaml ]]; then if [[ "$template_or_config" != *.yaml ]]; then
add_to_container << EOF add_to_container << EOF
@ -185,26 +193,28 @@ RUN mkdir -p /.llama /.cache
RUN chmod -R g+rw /app /.llama /.cache RUN chmod -R g+rw /app /.llama /.cache
EOF EOF
printf "Containerfile created successfully in $TEMP_DIR/Containerfile\n\n" printf "Containerfile created successfully in %s/Containerfile\n\n" "$TEMP_DIR"
cat $TEMP_DIR/Containerfile cat "$TEMP_DIR"/Containerfile
printf "\n" printf "\n"
mounts="" # Start building the CLI arguments
CLI_ARGS=()
# Read CONTAINER_OPTS and put it in an array
read -ra CLI_ARGS <<< "$CONTAINER_OPTS"
if [ "$USE_COPY_NOT_MOUNT" != "true" ]; then if [ "$USE_COPY_NOT_MOUNT" != "true" ]; then
if [ -n "$LLAMA_STACK_DIR" ]; then if [ -n "$LLAMA_STACK_DIR" ]; then
mounts="$mounts -v $(readlink -f $LLAMA_STACK_DIR):$stack_mount" CLI_ARGS+=("-v" "$(readlink -f "$LLAMA_STACK_DIR"):$stack_mount")
fi
if [ -n "$LLAMA_MODELS_DIR" ]; then
mounts="$mounts -v $(readlink -f $LLAMA_MODELS_DIR):$models_mount"
fi fi
if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
mounts="$mounts -v $(readlink -f $LLAMA_STACK_CLIENT_DIR):$client_mount" CLI_ARGS+=("-v" "$(readlink -f "$LLAMA_STACK_CLIENT_DIR"):$client_mount")
fi fi
fi fi
if command -v selinuxenabled &>/dev/null && selinuxenabled; then if is_command_available selinuxenabled && selinuxenabled; then
# Disable SELinux labels -- we don't want to relabel the llama-stack source dir # Disable SELinux labels -- we don't want to relabel the llama-stack source dir
CONTAINER_OPTS="$CONTAINER_OPTS --security-opt label=disable" CLI_ARGS+=("--security-opt" "label=disable")
fi fi
# Set version tag based on PyPI version # Set version tag based on PyPI version
@ -212,7 +222,7 @@ if [ -n "$PYPI_VERSION" ]; then
version_tag="$PYPI_VERSION" version_tag="$PYPI_VERSION"
elif [ -n "$TEST_PYPI_VERSION" ]; then elif [ -n "$TEST_PYPI_VERSION" ]; then
version_tag="test-$TEST_PYPI_VERSION" version_tag="test-$TEST_PYPI_VERSION"
elif [[ -n "$LLAMA_STACK_DIR" || -n "$LLAMA_MODELS_DIR" ]]; then elif [[ -n "$LLAMA_STACK_DIR" || -n "$LLAMA_STACK_CLIENT_DIR" ]]; then
version_tag="dev" version_tag="dev"
else else
URL="https://pypi.org/pypi/llama-stack/json" URL="https://pypi.org/pypi/llama-stack/json"
@ -225,11 +235,11 @@ image_tag="$image_name:$version_tag"
# Detect platform architecture # Detect platform architecture
ARCH=$(uname -m) ARCH=$(uname -m)
if [ -n "$BUILD_PLATFORM" ]; then if [ -n "$BUILD_PLATFORM" ]; then
PLATFORM="--platform $BUILD_PLATFORM" CLI_ARGS+=("--platform $BUILD_PLATFORM")
elif [ "$ARCH" = "arm64" ] || [ "$ARCH" = "aarch64" ]; then elif [ "$ARCH" = "arm64" ] || [ "$ARCH" = "aarch64" ]; then
PLATFORM="--platform linux/arm64" CLI_ARGS+=("--platform" "linux/arm64")
elif [ "$ARCH" = "x86_64" ]; then elif [ "$ARCH" = "x86_64" ]; then
PLATFORM="--platform linux/amd64" CLI_ARGS+=("--platform" "linux/amd64")
else else
echo "Unsupported architecture: $ARCH" echo "Unsupported architecture: $ARCH"
exit 1 exit 1
@ -238,8 +248,13 @@ fi
echo "PWD: $(pwd)" echo "PWD: $(pwd)"
echo "Containerfile: $TEMP_DIR/Containerfile" echo "Containerfile: $TEMP_DIR/Containerfile"
set -x set -x
$CONTAINER_BINARY build $CONTAINER_OPTS $PLATFORM -t $image_tag \
-f "$TEMP_DIR/Containerfile" "." $mounts --progress=plain $CONTAINER_BINARY build \
"${CLI_ARGS[@]}" \
-t "$image_tag" \
-f "$TEMP_DIR/Containerfile" \
"." \
--progress=plain
# clean up tmp/configs # clean up tmp/configs
set +x set +x

View file

@ -9,8 +9,8 @@
# TODO: combine this with build_conda_env.sh since it is almost identical # TODO: combine this with build_conda_env.sh since it is almost identical
# the only difference is that we don't do any conda-specific setup # the only difference is that we don't do any conda-specific setup
LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-} LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
LLAMA_STACK_CLIENT_DIR=${LLAMA_STACK_CLIENT_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-} TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
# This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out # This timeout (in seconds) is necessary when installing PyTorch via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
@ -21,8 +21,8 @@ VIRTUAL_ENV=${VIRTUAL_ENV:-}
if [ -n "$LLAMA_STACK_DIR" ]; then if [ -n "$LLAMA_STACK_DIR" ]; then
echo "Using llama-stack-dir=$LLAMA_STACK_DIR" echo "Using llama-stack-dir=$LLAMA_STACK_DIR"
fi fi
if [ -n "$LLAMA_MODELS_DIR" ]; then if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
echo "Using llama-models-dir=$LLAMA_MODELS_DIR" echo "Using llama-stack-client-dir=$LLAMA_STACK_CLIENT_DIR"
fi fi
if [ "$#" -lt 2 ]; then if [ "$#" -lt 2 ]; then
@ -95,7 +95,7 @@ run() {
# we are building a command line so word splitting is expected # we are building a command line so word splitting is expected
uv pip install --extra-index-url https://test.pypi.org/simple/ \ uv pip install --extra-index-url https://test.pypi.org/simple/ \
--index-strategy unsafe-best-match \ --index-strategy unsafe-best-match \
llama-models=="$TEST_PYPI_VERSION" llama-stack=="$TEST_PYPI_VERSION" \ llama-stack=="$TEST_PYPI_VERSION" \
$pip_dependencies $pip_dependencies
if [ -n "$special_pip_deps" ]; then if [ -n "$special_pip_deps" ]; then
IFS='#' read -ra parts <<<"$special_pip_deps" IFS='#' read -ra parts <<<"$special_pip_deps"
@ -120,15 +120,14 @@ run() {
uv pip install --no-cache-dir llama-stack uv pip install --no-cache-dir llama-stack
fi fi
if [ -n "$LLAMA_MODELS_DIR" ]; then if [ -n "$LLAMA_STACK_CLIENT_DIR" ]; then
if [ ! -d "$LLAMA_MODELS_DIR" ]; then if [ ! -d "$LLAMA_STACK_CLIENT_DIR" ]; then
printf "${RED}Warning: LLAMA_MODELS_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_MODELS_DIR" >&2 printf "${RED}Warning: LLAMA_STACK_CLIENT_DIR is set but directory does not exist: %s${NC}\n" "$LLAMA_STACK_CLIENT_DIR" >&2
exit 1 exit 1
fi fi
printf "Installing from LLAMA_MODELS_DIR: %s\n" "$LLAMA_MODELS_DIR" printf "Installing from LLAMA_STACK_CLIENT_DIR: %s\n" "$LLAMA_STACK_CLIENT_DIR"
uv pip uninstall llama-models uv pip install --no-cache-dir -e "$LLAMA_STACK_CLIENT_DIR"
uv pip install --no-cache-dir -e "$LLAMA_MODELS_DIR"
fi fi
# Install pip dependencies # Install pip dependencies

View file

@ -39,7 +39,7 @@ def configure_single_provider(registry: Dict[str, ProviderSpec], provider: Provi
return Provider( return Provider(
provider_id=provider.provider_id, provider_id=provider.provider_id,
provider_type=provider.provider_type, provider_type=provider.provider_type,
config=cfg.dict(), config=cfg.model_dump(),
) )
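
The cfg.dict() to cfg.model_dump() change above is the pydantic v2 rename of the serialization accessor. A minimal sketch, assuming pydantic v2 and an illustrative Provider model with the two fields used here (the provider values are made up):

from pydantic import BaseModel

class Provider(BaseModel):
    provider_id: str
    provider_type: str

p = Provider(provider_id="ollama", provider_type="remote::ollama")
# pydantic v2: model_dump() supersedes the v1-era dict()
print(p.model_dump())  # {'provider_id': 'ollama', 'provider_type': 'remote::ollama'}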

View file

@ -1,47 +0,0 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
CONTAINER_BINARY=${CONTAINER_BINARY:-docker}
CONTAINER_OPTS=${CONTAINER_OPTS:-}
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
set -euo pipefail
error_handler() {
echo "Error occurred in script at line: ${1}" >&2
exit 1
}
trap 'error_handler ${LINENO}' ERR
if [ $# -lt 2 ]; then
echo "Usage: $0 <container name> <build file path>"
exit 1
fi
container_image="$1"
host_build_dir="$2"
container_build_dir="/app/builds"
if command -v selinuxenabled &> /dev/null && selinuxenabled; then
# Disable SELinux labels
CONTAINER_OPTS="$CONTAINER_OPTS --security-opt label=disable"
fi
mounts=""
if [ -n "$LLAMA_STACK_DIR" ]; then
mounts="$mounts -v $(readlink -f $LLAMA_STACK_DIR):/app/llama-stack-source"
fi
set -x
$CONTAINER_BINARY run $CONTAINER_OPTS -it \
--entrypoint "/usr/local/bin/llama" \
-v $host_build_dir:$container_build_dir \
$mounts \
$container_image \
stack configure ./llamastack-build.yaml --output-dir $container_build_dir

View file

@ -13,7 +13,7 @@ from llama_stack.providers.datatypes import Api, ProviderSpec
def stack_apis() -> List[Api]: def stack_apis() -> List[Api]:
return [v for v in Api] return list(Api)
class AutoRoutedApiInfo(BaseModel): class AutoRoutedApiInfo(BaseModel):
@ -55,7 +55,7 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
def providable_apis() -> List[Api]: def providable_apis() -> List[Api]:
routing_table_apis = set(x.routing_table_api for x in builtin_automatically_routed_apis()) routing_table_apis = {x.routing_table_api for x in builtin_automatically_routed_apis()}
return [api for api in Api if api not in routing_table_apis and api != Api.inspect] return [api for api in Api if api not in routing_table_apis and api != Api.inspect]

View file

@ -32,7 +32,10 @@ from termcolor import cprint
from llama_stack.distribution.build import print_pip_install_help from llama_stack.distribution.build import print_pip_install_help
from llama_stack.distribution.configure import parse_and_maybe_upgrade_config from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
from llama_stack.distribution.datatypes import Api from llama_stack.distribution.datatypes import Api
from llama_stack.distribution.request_headers import set_request_provider_data from llama_stack.distribution.request_headers import (
preserve_headers_context_async_generator,
request_provider_data_context,
)
from llama_stack.distribution.resolver import ProviderRegistry from llama_stack.distribution.resolver import ProviderRegistry
from llama_stack.distribution.server.endpoints import get_all_api_endpoints from llama_stack.distribution.server.endpoints import get_all_api_endpoints
from llama_stack.distribution.stack import ( from llama_stack.distribution.stack import (
@ -104,7 +107,7 @@ def convert_to_pydantic(annotation: Any, value: Any) -> Any:
logger.warning( logger.warning(
f"Warning: direct client failed to convert parameter {value} into {annotation}: {e}", f"Warning: direct client failed to convert parameter {value} into {annotation}: {e}",
) )
return value raise ValueError(f"Failed to convert parameter {value} into {annotation}: {e}") from e
class LlamaStackAsLibraryClient(LlamaStackClient): class LlamaStackAsLibraryClient(LlamaStackClient):
@ -160,6 +163,9 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
except StopAsyncIteration: except StopAsyncIteration:
pass pass
finally: finally:
pending = asyncio.all_tasks(loop)
if pending:
loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
loop.close() loop.close()
return sync_generator() return sync_generator()
@ -262,21 +268,25 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
if not self.endpoint_impls: if not self.endpoint_impls:
raise ValueError("Client not initialized") raise ValueError("Client not initialized")
# Create headers with provider data if available
headers = {}
if self.provider_data: if self.provider_data:
set_request_provider_data({"X-LlamaStack-Provider-Data": json.dumps(self.provider_data)}) headers["X-LlamaStack-Provider-Data"] = json.dumps(self.provider_data)
if stream: # Use context manager for provider data
response = await self._call_streaming( with request_provider_data_context(headers):
cast_to=cast_to, if stream:
options=options, response = await self._call_streaming(
stream_cls=stream_cls, cast_to=cast_to,
) options=options,
else: stream_cls=stream_cls,
response = await self._call_non_streaming( )
cast_to=cast_to, else:
options=options, response = await self._call_non_streaming(
) cast_to=cast_to,
return response options=options,
)
return response
def _find_matching_endpoint(self, method: str, path: str) -> tuple[Any, dict]: def _find_matching_endpoint(self, method: str, path: str) -> tuple[Any, dict]:
"""Find the matching endpoint implementation for a given method and path. """Find the matching endpoint implementation for a given method and path.
@ -324,6 +334,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
await end_trace() await end_trace()
json_content = json.dumps(convert_pydantic_to_json_value(result)) json_content = json.dumps(convert_pydantic_to_json_value(result))
mock_response = httpx.Response( mock_response = httpx.Response(
status_code=httpx.codes.OK, status_code=httpx.codes.OK,
content=json_content.encode("utf-8"), content=json_content.encode("utf-8"),
@ -335,7 +346,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
url=options.url, url=options.url,
params=options.params, params=options.params,
headers=options.headers or {}, headers=options.headers or {},
json=options.json_data, json=convert_pydantic_to_json_value(body),
), ),
) )
response = APIResponse( response = APIResponse(
@ -373,9 +384,11 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
finally: finally:
await end_trace() await end_trace()
# Wrap the generator to preserve context across iterations
wrapped_gen = preserve_headers_context_async_generator(gen())
mock_response = httpx.Response( mock_response = httpx.Response(
status_code=httpx.codes.OK, status_code=httpx.codes.OK,
content=gen(), content=wrapped_gen,
headers={ headers={
"Content-Type": "application/json", "Content-Type": "application/json",
}, },
@ -384,7 +397,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
url=options.url, url=options.url,
params=options.params, params=options.params,
headers=options.headers or {}, headers=options.headers or {},
json=options.json_data, json=convert_pydantic_to_json_value(body),
), ),
) )
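
The finally block added above drains tasks that are still pending on the event loop before closing it, so work spawned while streaming is awaited instead of being destroyed. A standalone sketch of that cleanup pattern, with a stand-in background task:

import asyncio

def drain_and_close(loop: asyncio.AbstractEventLoop) -> None:
    # Await anything still pending on the loop, then close it.
    pending = asyncio.all_tasks(loop)
    if pending:
        loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
    loop.close()

loop = asyncio.new_event_loop()
loop.create_task(asyncio.sleep(0))  # stands in for a task left behind by a stream
drain_and_close(loop)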

View file

@ -4,16 +4,62 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import contextvars
import json import json
import logging import logging
import threading from typing import Any, AsyncGenerator, ContextManager, Dict, Optional, TypeVar
from typing import Any, Dict
from .utils.dynamic import instantiate_class_type from .utils.dynamic import instantiate_class_type
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
_THREAD_LOCAL = threading.local() # Context variable for request provider data
_provider_data_var = contextvars.ContextVar("provider_data", default=None)
class RequestProviderDataContext(ContextManager):
"""Context manager for request provider data"""
def __init__(self, provider_data: Optional[Dict[str, Any]] = None):
self.provider_data = provider_data
self.token = None
def __enter__(self):
# Save the current value and set the new one
self.token = _provider_data_var.set(self.provider_data)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
# Restore the previous value
if self.token is not None:
_provider_data_var.reset(self.token)
T = TypeVar("T")
def preserve_headers_context_async_generator(gen: AsyncGenerator[T, None]) -> AsyncGenerator[T, None]:
"""
Wraps an async generator to preserve request headers context variables across iterations.
This ensures that context variables set during generator creation are
available during each iteration of the generator, even if the original
context manager has exited.
"""
# Capture the current context value right now
context_value = _provider_data_var.get()
async def wrapper():
while True:
# Set context before each anext() call
_ = _provider_data_var.set(context_value)
try:
item = await gen.__anext__()
yield item
except StopAsyncIteration:
break
return wrapper()
class NeedsRequestProviderData: class NeedsRequestProviderData:
@ -26,7 +72,7 @@ class NeedsRequestProviderData:
if not validator_class: if not validator_class:
raise ValueError(f"Provider {provider_type} does not have a validator") raise ValueError(f"Provider {provider_type} does not have a validator")
val = getattr(_THREAD_LOCAL, "provider_data_header_value", None) val = _provider_data_var.get()
if not val: if not val:
return None return None
@ -36,25 +82,32 @@ class NeedsRequestProviderData:
return provider_data return provider_data
except Exception as e: except Exception as e:
log.error(f"Error parsing provider data: {e}") log.error(f"Error parsing provider data: {e}")
return None
def set_request_provider_data(headers: Dict[str, str]): def parse_request_provider_data(headers: Dict[str, str]) -> Optional[Dict[str, Any]]:
"""Parse provider data from request headers"""
keys = [ keys = [
"X-LlamaStack-Provider-Data", "X-LlamaStack-Provider-Data",
"x-llamastack-provider-data", "x-llamastack-provider-data",
] ]
val = None
for key in keys: for key in keys:
val = headers.get(key, None) val = headers.get(key, None)
if val: if val:
break break
if not val: if not val:
return return None
try: try:
val = json.loads(val) return json.loads(val)
except json.JSONDecodeError: except json.JSONDecodeError:
log.error("Provider data not encoded as a JSON object!", val) log.error("Provider data not encoded as a JSON object!")
return return None
_THREAD_LOCAL.provider_data_header_value = val
def request_provider_data_context(headers: Dict[str, str]) -> ContextManager:
"""Context manager that sets request provider data from headers for the duration of the context"""
provider_data = parse_request_provider_data(headers)
return RequestProviderDataContext(provider_data)
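
A hedged usage sketch of the two helpers above, assuming the import path used by library_client.py; fetch_events and the header payload are illustrative stand-ins. The context value captured inside the with block stays visible to provider code (via NeedsRequestProviderData) on every iteration, even after the context manager exits:

import asyncio
from llama_stack.distribution.request_headers import (
    preserve_headers_context_async_generator,
    request_provider_data_context,
)

async def fetch_events():
    # Stand-in for a provider call that streams results.
    yield "chunk-1"
    yield "chunk-2"

async def main():
    headers = {"X-LlamaStack-Provider-Data": '{"example_api_key": "illustrative"}'}
    with request_provider_data_context(headers):
        # Capture the contextvar now; each later iteration re-applies it.
        stream = preserve_headers_context_async_generator(fetch_events())
    async for item in stream:
        print(item)

asyncio.run(main())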

View file

@ -5,8 +5,7 @@
# the root directory of this source tree. # the root directory of this source tree.
import importlib import importlib
import inspect import inspect
import logging from typing import Any, Dict, List, Set, Tuple
from typing import Any, Dict, List, Set
from llama_stack.apis.agents import Agents from llama_stack.apis.agents import Agents
from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.benchmarks import Benchmarks
@ -35,6 +34,7 @@ from llama_stack.distribution.datatypes import (
from llama_stack.distribution.distribution import builtin_automatically_routed_apis from llama_stack.distribution.distribution import builtin_automatically_routed_apis
from llama_stack.distribution.store import DistributionRegistry from llama_stack.distribution.store import DistributionRegistry
from llama_stack.distribution.utils.dynamic import instantiate_class_type from llama_stack.distribution.utils.dynamic import instantiate_class_type
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import ( from llama_stack.providers.datatypes import (
Api, Api,
BenchmarksProtocolPrivate, BenchmarksProtocolPrivate,
@ -50,7 +50,7 @@ from llama_stack.providers.datatypes import (
VectorDBsProtocolPrivate, VectorDBsProtocolPrivate,
) )
log = logging.getLogger(__name__) logger = get_logger(name=__name__, category="core")
class InvalidProviderError(Exception): class InvalidProviderError(Exception):
@ -104,60 +104,43 @@ class ProviderWithSpec(Provider):
ProviderRegistry = Dict[Api, Dict[str, ProviderSpec]] ProviderRegistry = Dict[Api, Dict[str, ProviderSpec]]
# TODO: this code is not very straightforward to follow and needs one more round of refactoring
async def resolve_impls( async def resolve_impls(
run_config: StackRunConfig, run_config: StackRunConfig,
provider_registry: ProviderRegistry, provider_registry: ProviderRegistry,
dist_registry: DistributionRegistry, dist_registry: DistributionRegistry,
) -> Dict[Api, Any]: ) -> Dict[Api, Any]:
""" """
Does two things: Resolves provider implementations by:
- flatmaps, sorts and resolves the providers in dependency order 1. Validating and organizing providers.
- for each API, produces either a (local, passthrough or router) implementation 2. Sorting them in dependency order.
3. Instantiating them with required dependencies.
""" """
routing_table_apis = set(x.routing_table_api for x in builtin_automatically_routed_apis()) routing_table_apis = {x.routing_table_api for x in builtin_automatically_routed_apis()}
router_apis = set(x.router_api for x in builtin_automatically_routed_apis()) router_apis = {x.router_api for x in builtin_automatically_routed_apis()}
providers_with_specs = {} providers_with_specs = validate_and_prepare_providers(
run_config, provider_registry, routing_table_apis, router_apis
for api_str, providers in run_config.providers.items(): )
api = Api(api_str)
if api in routing_table_apis:
raise ValueError(f"Provider for `{api_str}` is automatically provided and cannot be overridden")
specs = {}
for provider in providers:
if provider.provider_type not in provider_registry[api]:
raise ValueError(f"Provider `{provider.provider_type}` is not available for API `{api}`")
p = provider_registry[api][provider.provider_type]
if p.deprecation_error:
log.error(p.deprecation_error, "red", attrs=["bold"])
raise InvalidProviderError(p.deprecation_error)
elif p.deprecation_warning:
log.warning(
f"Provider `{provider.provider_type}` for API `{api}` is deprecated and will be removed in a future release: {p.deprecation_warning}",
)
p.deps__ = [a.value for a in p.api_dependencies] + [a.value for a in p.optional_api_dependencies]
spec = ProviderWithSpec(
spec=p,
**(provider.model_dump()),
)
specs[provider.provider_id] = spec
key = api_str if api not in router_apis else f"inner-{api_str}"
providers_with_specs[key] = specs
apis_to_serve = run_config.apis or set( apis_to_serve = run_config.apis or set(
list(providers_with_specs.keys()) + [x.value for x in routing_table_apis] + [x.value for x in router_apis] list(providers_with_specs.keys()) + [x.value for x in routing_table_apis] + [x.value for x in router_apis]
) )
providers_with_specs.update(specs_for_autorouted_apis(apis_to_serve))
sorted_providers = sort_providers_by_deps(providers_with_specs, run_config)
return await instantiate_providers(sorted_providers, router_apis, dist_registry)
def specs_for_autorouted_apis(apis_to_serve: List[str] | Set[str]) -> Dict[str, Dict[str, ProviderWithSpec]]:
"""Generates specifications for automatically routed APIs."""
specs = {}
for info in builtin_automatically_routed_apis(): for info in builtin_automatically_routed_apis():
if info.router_api.value not in apis_to_serve: if info.router_api.value not in apis_to_serve:
continue continue
providers_with_specs[info.routing_table_api.value] = { specs[info.routing_table_api.value] = {
"__builtin__": ProviderWithSpec( "__builtin__": ProviderWithSpec(
provider_id="__routing_table__", provider_id="__routing_table__",
provider_type="__routing_table__", provider_type="__routing_table__",
@ -167,12 +150,12 @@ async def resolve_impls(
router_api=info.router_api, router_api=info.router_api,
module="llama_stack.distribution.routers", module="llama_stack.distribution.routers",
api_dependencies=[], api_dependencies=[],
deps__=([f"inner-{info.router_api.value}"]), deps__=[f"inner-{info.router_api.value}"],
), ),
) )
} }
providers_with_specs[info.router_api.value] = { specs[info.router_api.value] = {
"__builtin__": ProviderWithSpec( "__builtin__": ProviderWithSpec(
provider_id="__autorouted__", provider_id="__autorouted__",
provider_type="__autorouted__", provider_type="__autorouted__",
@ -182,12 +165,66 @@ async def resolve_impls(
module="llama_stack.distribution.routers", module="llama_stack.distribution.routers",
routing_table_api=info.routing_table_api, routing_table_api=info.routing_table_api,
api_dependencies=[info.routing_table_api], api_dependencies=[info.routing_table_api],
deps__=([info.routing_table_api.value]), deps__=[info.routing_table_api.value],
), ),
) )
} }
return specs
sorted_providers = topological_sort({k: v.values() for k, v in providers_with_specs.items()})
def validate_and_prepare_providers(
run_config: StackRunConfig, provider_registry: ProviderRegistry, routing_table_apis: Set[Api], router_apis: Set[Api]
) -> Dict[str, Dict[str, ProviderWithSpec]]:
"""Validates providers, handles deprecations, and organizes them into a spec dictionary."""
providers_with_specs: Dict[str, Dict[str, ProviderWithSpec]] = {}
for api_str, providers in run_config.providers.items():
api = Api(api_str)
if api in routing_table_apis:
raise ValueError(f"Provider for `{api_str}` is automatically provided and cannot be overridden")
specs = {}
for provider in providers:
if not provider.provider_id or provider.provider_id == "__disabled__":
logger.warning(f"Provider `{provider.provider_type}` for API `{api}` is disabled")
continue
validate_provider(provider, api, provider_registry)
p = provider_registry[api][provider.provider_type]
p.deps__ = [a.value for a in p.api_dependencies] + [a.value for a in p.optional_api_dependencies]
spec = ProviderWithSpec(spec=p, **provider.model_dump())
specs[provider.provider_id] = spec
key = api_str if api not in router_apis else f"inner-{api_str}"
providers_with_specs[key] = specs
return providers_with_specs
def validate_provider(provider: Provider, api: Api, provider_registry: ProviderRegistry):
"""Validates if the provider is allowed and handles deprecations."""
if provider.provider_type not in provider_registry[api]:
raise ValueError(f"Provider `{provider.provider_type}` is not available for API `{api}`")
p = provider_registry[api][provider.provider_type]
if p.deprecation_error:
logger.error(p.deprecation_error)
raise InvalidProviderError(p.deprecation_error)
elif p.deprecation_warning:
logger.warning(
f"Provider `{provider.provider_type}` for API `{api}` is deprecated and will be removed in a future release: {p.deprecation_warning}",
)
def sort_providers_by_deps(
providers_with_specs: Dict[str, Dict[str, ProviderWithSpec]], run_config: StackRunConfig
) -> List[Tuple[str, ProviderWithSpec]]:
"""Sorts providers based on their dependencies."""
sorted_providers: List[Tuple[str, ProviderWithSpec]] = topological_sort(
{k: list(v.values()) for k, v in providers_with_specs.items()}
)
# Append built-in "inspect" provider
apis = [x[1].spec.api for x in sorted_providers] apis = [x[1].spec.api for x in sorted_providers]
sorted_providers.append( sorted_providers.append(
( (
@ -195,28 +232,32 @@ async def resolve_impls(
ProviderWithSpec( ProviderWithSpec(
provider_id="__builtin__", provider_id="__builtin__",
provider_type="__builtin__", provider_type="__builtin__",
config={ config={"run_config": run_config.model_dump()},
"run_config": run_config.dict(),
},
spec=InlineProviderSpec( spec=InlineProviderSpec(
api=Api.inspect, api=Api.inspect,
provider_type="__builtin__", provider_type="__builtin__",
config_class="llama_stack.distribution.inspect.DistributionInspectConfig", config_class="llama_stack.distribution.inspect.DistributionInspectConfig",
module="llama_stack.distribution.inspect", module="llama_stack.distribution.inspect",
api_dependencies=apis, api_dependencies=apis,
deps__=([x.value for x in apis]), deps__=[x.value for x in apis],
), ),
), ),
) )
) )
log.info(f"Resolved {len(sorted_providers)} providers") logger.debug(f"Resolved {len(sorted_providers)} providers")
for api_str, provider in sorted_providers: for api_str, provider in sorted_providers:
log.info(f" {api_str} => {provider.provider_id}") logger.debug(f" {api_str} => {provider.provider_id}")
log.info("") logger.debug("")
return sorted_providers
impls = {}
inner_impls_by_provider_id = {f"inner-{x.value}": {} for x in router_apis} async def instantiate_providers(
sorted_providers: List[Tuple[str, ProviderWithSpec]], router_apis: Set[Api], dist_registry: DistributionRegistry
) -> Dict:
"""Instantiates providers asynchronously while managing dependencies."""
impls: Dict[Api, Any] = {}
inner_impls_by_provider_id: Dict[str, Dict[str, Any]] = {f"inner-{x.value}": {} for x in router_apis}
for api_str, provider in sorted_providers: for api_str, provider in sorted_providers:
deps = {a: impls[a] for a in provider.spec.api_dependencies} deps = {a: impls[a] for a in provider.spec.api_dependencies}
for a in provider.spec.optional_api_dependencies: for a in provider.spec.optional_api_dependencies:
@ -227,14 +268,9 @@ async def resolve_impls(
if isinstance(provider.spec, RoutingTableProviderSpec): if isinstance(provider.spec, RoutingTableProviderSpec):
inner_impls = inner_impls_by_provider_id[f"inner-{provider.spec.router_api.value}"] inner_impls = inner_impls_by_provider_id[f"inner-{provider.spec.router_api.value}"]
impl = await instantiate_provider( impl = await instantiate_provider(provider, deps, inner_impls, dist_registry)
provider,
deps, if api_str.startswith("inner-"):
inner_impls,
dist_registry,
)
# TODO: ugh slightly redesign this shady looking code
if "inner-" in api_str:
inner_impls_by_provider_id[api_str][provider.provider_id] = impl inner_impls_by_provider_id[api_str][provider.provider_id] = impl
else: else:
api = Api(api_str) api = Api(api_str)
@ -245,7 +281,7 @@ async def resolve_impls(
def topological_sort( def topological_sort(
providers_with_specs: Dict[str, List[ProviderWithSpec]], providers_with_specs: Dict[str, List[ProviderWithSpec]],
) -> List[ProviderWithSpec]: ) -> List[Tuple[str, ProviderWithSpec]]:
def dfs(kv, visited: Set[str], stack: List[str]): def dfs(kv, visited: Set[str], stack: List[str]):
api_str, providers = kv api_str, providers = kv
visited.add(api_str) visited.add(api_str)
@ -261,8 +297,8 @@ def topological_sort(
stack.append(api_str) stack.append(api_str)
visited = set() visited: Set[str] = set()
stack = [] stack: List[str] = []
for api_str, providers in providers_with_specs.items(): for api_str, providers in providers_with_specs.items():
if api_str not in visited: if api_str not in visited:
@ -272,13 +308,14 @@ def topological_sort(
for api_str in stack: for api_str in stack:
for provider in providers_with_specs[api_str]: for provider in providers_with_specs[api_str]:
flattened.append((api_str, provider)) flattened.append((api_str, provider))
return flattened return flattened
# returns a class implementing the protocol corresponding to the Api # returns a class implementing the protocol corresponding to the Api
async def instantiate_provider( async def instantiate_provider(
provider: ProviderWithSpec, provider: ProviderWithSpec,
deps: Dict[str, Any], deps: Dict[Api, Any],
inner_impls: Dict[str, Any], inner_impls: Dict[str, Any],
dist_registry: DistributionRegistry, dist_registry: DistributionRegistry,
): ):
@ -286,8 +323,10 @@ async def instantiate_provider(
additional_protocols = additional_protocols_map() additional_protocols = additional_protocols_map()
provider_spec = provider.spec provider_spec = provider.spec
module = importlib.import_module(provider_spec.module) if not hasattr(provider_spec, "module"):
raise AttributeError(f"ProviderSpec of type {type(provider_spec)} does not have a 'module' attribute")
module = importlib.import_module(provider_spec.module)
args = [] args = []
if isinstance(provider_spec, RemoteProviderSpec): if isinstance(provider_spec, RemoteProviderSpec):
config_type = instantiate_class_type(provider_spec.config_class) config_type = instantiate_class_type(provider_spec.config_class)
@ -350,7 +389,7 @@ def check_protocol_compliance(obj: Any, protocol: Any) -> None:
obj_params = set(obj_sig.parameters) obj_params = set(obj_sig.parameters)
obj_params.discard("self") obj_params.discard("self")
if not (proto_params <= obj_params): if not (proto_params <= obj_params):
log.error(f"Method {name} incompatible proto: {proto_params} vs. obj: {obj_params}") logger.error(f"Method {name} incompatible proto: {proto_params} vs. obj: {obj_params}")
missing_methods.append((name, "signature_mismatch")) missing_methods.append((name, "signature_mismatch"))
else: else:
# Check if the method is actually implemented in the class # Check if the method is actually implemented in the class
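
The ordering step factored out above is a plain depth-first topological sort over provider dependencies. A standalone sketch of the same idea over a {name: [dependencies]} mapping, with illustrative API names:

from typing import Dict, List, Set

def topo_order(deps: Dict[str, List[str]]) -> List[str]:
    visited: Set[str] = set()
    order: List[str] = []

    def dfs(name: str) -> None:
        visited.add(name)
        for dep in deps.get(name, []):
            if dep not in visited:
                dfs(dep)
        order.append(name)  # emitted only after all of its dependencies

    for name in deps:
        if name not in visited:
            dfs(name)
    return order

print(topo_order({"agents": ["inference", "safety"], "safety": ["inference"], "inference": []}))
# -> ['inference', 'safety', 'agents']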

View file

@ -51,8 +51,10 @@ from llama_stack.apis.tools import (
ToolRuntime, ToolRuntime,
) )
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
from llama_stack.log import get_logger
from llama_stack.providers.datatypes import RoutingTable from llama_stack.providers.datatypes import RoutingTable
from llama_stack.providers.utils.inference.prompt_adapter import get_default_tool_prompt_format
logger = get_logger(name=__name__, category="core")
class VectorIORouter(VectorIO): class VectorIORouter(VectorIO):
@ -62,12 +64,15 @@ class VectorIORouter(VectorIO):
self, self,
routing_table: RoutingTable, routing_table: RoutingTable,
) -> None: ) -> None:
logger.debug("Initializing VectorIORouter")
self.routing_table = routing_table self.routing_table = routing_table
async def initialize(self) -> None: async def initialize(self) -> None:
logger.debug("VectorIORouter.initialize")
pass pass
async def shutdown(self) -> None: async def shutdown(self) -> None:
logger.debug("VectorIORouter.shutdown")
pass pass
async def register_vector_db( async def register_vector_db(
@ -78,6 +83,7 @@ class VectorIORouter(VectorIO):
provider_id: Optional[str] = None, provider_id: Optional[str] = None,
provider_vector_db_id: Optional[str] = None, provider_vector_db_id: Optional[str] = None,
) -> None: ) -> None:
logger.debug(f"VectorIORouter.register_vector_db: {vector_db_id}, {embedding_model}")
await self.routing_table.register_vector_db( await self.routing_table.register_vector_db(
vector_db_id, vector_db_id,
embedding_model, embedding_model,
@ -92,6 +98,9 @@ class VectorIORouter(VectorIO):
chunks: List[Chunk], chunks: List[Chunk],
ttl_seconds: Optional[int] = None, ttl_seconds: Optional[int] = None,
) -> None: ) -> None:
logger.debug(
f"VectorIORouter.insert_chunks: {vector_db_id}, {len(chunks)} chunks, ttl_seconds={ttl_seconds}, chunk_ids={[chunk.metadata['document_id'] for chunk in chunks[:3]]}{' and more...' if len(chunks) > 3 else ''}",
)
return await self.routing_table.get_provider_impl(vector_db_id).insert_chunks(vector_db_id, chunks, ttl_seconds) return await self.routing_table.get_provider_impl(vector_db_id).insert_chunks(vector_db_id, chunks, ttl_seconds)
async def query_chunks( async def query_chunks(
@ -100,6 +109,7 @@ class VectorIORouter(VectorIO):
query: InterleavedContent, query: InterleavedContent,
params: Optional[Dict[str, Any]] = None, params: Optional[Dict[str, Any]] = None,
) -> QueryChunksResponse: ) -> QueryChunksResponse:
logger.debug(f"VectorIORouter.query_chunks: {vector_db_id}")
return await self.routing_table.get_provider_impl(vector_db_id).query_chunks(vector_db_id, query, params) return await self.routing_table.get_provider_impl(vector_db_id).query_chunks(vector_db_id, query, params)
@ -110,12 +120,15 @@ class InferenceRouter(Inference):
self, self,
routing_table: RoutingTable, routing_table: RoutingTable,
) -> None: ) -> None:
logger.debug("Initializing InferenceRouter")
self.routing_table = routing_table self.routing_table = routing_table
async def initialize(self) -> None: async def initialize(self) -> None:
logger.debug("InferenceRouter.initialize")
pass pass
async def shutdown(self) -> None: async def shutdown(self) -> None:
logger.debug("InferenceRouter.shutdown")
pass pass
async def register_model( async def register_model(
@ -126,13 +139,16 @@ class InferenceRouter(Inference):
metadata: Optional[Dict[str, Any]] = None, metadata: Optional[Dict[str, Any]] = None,
model_type: Optional[ModelType] = None, model_type: Optional[ModelType] = None,
) -> None: ) -> None:
logger.debug(
f"InferenceRouter.register_model: {model_id=} {provider_model_id=} {provider_id=} {metadata=} {model_type=}",
)
await self.routing_table.register_model(model_id, provider_model_id, provider_id, metadata, model_type) await self.routing_table.register_model(model_id, provider_model_id, provider_id, metadata, model_type)
async def chat_completion( async def chat_completion(
self, self,
model_id: str, model_id: str,
messages: List[Message], messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(), sampling_params: Optional[SamplingParams] = None,
response_format: Optional[ResponseFormat] = None, response_format: Optional[ResponseFormat] = None,
tools: Optional[List[ToolDefinition]] = None, tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = None, tool_choice: Optional[ToolChoice] = None,
@ -141,6 +157,11 @@ class InferenceRouter(Inference):
logprobs: Optional[LogProbConfig] = None, logprobs: Optional[LogProbConfig] = None,
tool_config: Optional[ToolConfig] = None, tool_config: Optional[ToolConfig] = None,
) -> AsyncGenerator: ) -> AsyncGenerator:
logger.debug(
f"InferenceRouter.chat_completion: {model_id=}, {stream=}, {messages=}, {tools=}, {tool_config=}, {response_format=}",
)
if sampling_params is None:
sampling_params = SamplingParams()
model = await self.routing_table.get_model(model_id) model = await self.routing_table.get_model(model_id)
if model is None: if model is None:
raise ValueError(f"Model '{model_id}' not found") raise ValueError(f"Model '{model_id}' not found")
@ -159,8 +180,6 @@ class InferenceRouter(Inference):
params["tool_prompt_format"] = tool_prompt_format params["tool_prompt_format"] = tool_prompt_format
tool_config = ToolConfig(**params) tool_config = ToolConfig(**params)
tool_config.tool_prompt_format = tool_config.tool_prompt_format or get_default_tool_prompt_format(model_id)
tools = tools or [] tools = tools or []
if tool_config.tool_choice == ToolChoice.none: if tool_config.tool_choice == ToolChoice.none:
tools = [] tools = []
@ -196,11 +215,16 @@ class InferenceRouter(Inference):
self, self,
model_id: str, model_id: str,
content: InterleavedContent, content: InterleavedContent,
sampling_params: Optional[SamplingParams] = SamplingParams(), sampling_params: Optional[SamplingParams] = None,
response_format: Optional[ResponseFormat] = None, response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False, stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None, logprobs: Optional[LogProbConfig] = None,
) -> AsyncGenerator: ) -> AsyncGenerator:
if sampling_params is None:
sampling_params = SamplingParams()
logger.debug(
f"InferenceRouter.completion: {model_id=}, {stream=}, {content=}, {sampling_params=}, {response_format=}",
)
model = await self.routing_table.get_model(model_id) model = await self.routing_table.get_model(model_id)
if model is None: if model is None:
raise ValueError(f"Model '{model_id}' not found") raise ValueError(f"Model '{model_id}' not found")
@ -228,6 +252,7 @@ class InferenceRouter(Inference):
output_dimension: Optional[int] = None, output_dimension: Optional[int] = None,
task_type: Optional[EmbeddingTaskType] = None, task_type: Optional[EmbeddingTaskType] = None,
) -> EmbeddingsResponse: ) -> EmbeddingsResponse:
logger.debug(f"InferenceRouter.embeddings: {model_id}")
model = await self.routing_table.get_model(model_id) model = await self.routing_table.get_model(model_id)
if model is None: if model is None:
raise ValueError(f"Model '{model_id}' not found") raise ValueError(f"Model '{model_id}' not found")
@ -247,12 +272,15 @@ class SafetyRouter(Safety):
self, self,
routing_table: RoutingTable, routing_table: RoutingTable,
) -> None: ) -> None:
logger.debug("Initializing SafetyRouter")
self.routing_table = routing_table self.routing_table = routing_table
async def initialize(self) -> None: async def initialize(self) -> None:
logger.debug("SafetyRouter.initialize")
pass pass
async def shutdown(self) -> None: async def shutdown(self) -> None:
logger.debug("SafetyRouter.shutdown")
pass pass
async def register_shield( async def register_shield(
@ -262,6 +290,7 @@ class SafetyRouter(Safety):
provider_id: Optional[str] = None, provider_id: Optional[str] = None,
params: Optional[Dict[str, Any]] = None, params: Optional[Dict[str, Any]] = None,
) -> Shield: ) -> Shield:
logger.debug(f"SafetyRouter.register_shield: {shield_id}")
return await self.routing_table.register_shield(shield_id, provider_shield_id, provider_id, params) return await self.routing_table.register_shield(shield_id, provider_shield_id, provider_id, params)
async def run_shield( async def run_shield(
@ -270,6 +299,7 @@ class SafetyRouter(Safety):
messages: List[Message], messages: List[Message],
params: Dict[str, Any] = None, params: Dict[str, Any] = None,
) -> RunShieldResponse: ) -> RunShieldResponse:
logger.debug(f"SafetyRouter.run_shield: {shield_id}")
return await self.routing_table.get_provider_impl(shield_id).run_shield( return await self.routing_table.get_provider_impl(shield_id).run_shield(
shield_id=shield_id, shield_id=shield_id,
messages=messages, messages=messages,
@ -282,12 +312,15 @@ class DatasetIORouter(DatasetIO):
self, self,
routing_table: RoutingTable, routing_table: RoutingTable,
) -> None: ) -> None:
logger.debug("Initializing DatasetIORouter")
self.routing_table = routing_table self.routing_table = routing_table
async def initialize(self) -> None: async def initialize(self) -> None:
logger.debug("DatasetIORouter.initialize")
pass pass
async def shutdown(self) -> None: async def shutdown(self) -> None:
logger.debug("DatasetIORouter.shutdown")
pass pass
async def get_rows_paginated( async def get_rows_paginated(
@ -297,6 +330,9 @@ class DatasetIORouter(DatasetIO):
page_token: Optional[str] = None, page_token: Optional[str] = None,
filter_condition: Optional[str] = None, filter_condition: Optional[str] = None,
) -> PaginatedRowsResult: ) -> PaginatedRowsResult:
logger.debug(
f"DatasetIORouter.get_rows_paginated: {dataset_id}, rows_in_page={rows_in_page}",
)
return await self.routing_table.get_provider_impl(dataset_id).get_rows_paginated( return await self.routing_table.get_provider_impl(dataset_id).get_rows_paginated(
dataset_id=dataset_id, dataset_id=dataset_id,
rows_in_page=rows_in_page, rows_in_page=rows_in_page,
@ -305,6 +341,7 @@ class DatasetIORouter(DatasetIO):
) )
async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None:
logger.debug(f"DatasetIORouter.append_rows: {dataset_id}, {len(rows)} rows")
return await self.routing_table.get_provider_impl(dataset_id).append_rows( return await self.routing_table.get_provider_impl(dataset_id).append_rows(
dataset_id=dataset_id, dataset_id=dataset_id,
rows=rows, rows=rows,
@ -316,12 +353,15 @@ class ScoringRouter(Scoring):
self, self,
routing_table: RoutingTable, routing_table: RoutingTable,
) -> None: ) -> None:
logger.debug("Initializing ScoringRouter")
self.routing_table = routing_table self.routing_table = routing_table
async def initialize(self) -> None: async def initialize(self) -> None:
logger.debug("ScoringRouter.initialize")
pass pass
async def shutdown(self) -> None: async def shutdown(self) -> None:
logger.debug("ScoringRouter.shutdown")
pass pass
async def score_batch( async def score_batch(
@ -330,6 +370,7 @@ class ScoringRouter(Scoring):
scoring_functions: Dict[str, Optional[ScoringFnParams]] = None, scoring_functions: Dict[str, Optional[ScoringFnParams]] = None,
save_results_dataset: bool = False, save_results_dataset: bool = False,
) -> ScoreBatchResponse: ) -> ScoreBatchResponse:
logger.debug(f"ScoringRouter.score_batch: {dataset_id}")
res = {} res = {}
for fn_identifier in scoring_functions.keys(): for fn_identifier in scoring_functions.keys():
score_response = await self.routing_table.get_provider_impl(fn_identifier).score_batch( score_response = await self.routing_table.get_provider_impl(fn_identifier).score_batch(
@ -350,6 +391,7 @@ class ScoringRouter(Scoring):
input_rows: List[Dict[str, Any]], input_rows: List[Dict[str, Any]],
scoring_functions: Dict[str, Optional[ScoringFnParams]] = None, scoring_functions: Dict[str, Optional[ScoringFnParams]] = None,
) -> ScoreResponse: ) -> ScoreResponse:
logger.debug(f"ScoringRouter.score: {len(input_rows)} rows, {len(scoring_functions)} functions")
res = {} res = {}
# look up and map each scoring function to its provider impl # look up and map each scoring function to its provider impl
for fn_identifier in scoring_functions.keys(): for fn_identifier in scoring_functions.keys():
@ -367,22 +409,26 @@ class EvalRouter(Eval):
self, self,
routing_table: RoutingTable, routing_table: RoutingTable,
) -> None: ) -> None:
logger.debug("Initializing EvalRouter")
self.routing_table = routing_table self.routing_table = routing_table
async def initialize(self) -> None: async def initialize(self) -> None:
logger.debug("EvalRouter.initialize")
pass pass
async def shutdown(self) -> None: async def shutdown(self) -> None:
logger.debug("EvalRouter.shutdown")
pass pass
async def run_eval( async def run_eval(
self, self,
benchmark_id: str, benchmark_id: str,
task_config: BenchmarkConfig, benchmark_config: BenchmarkConfig,
) -> Job: ) -> Job:
logger.debug(f"EvalRouter.run_eval: {benchmark_id}")
return await self.routing_table.get_provider_impl(benchmark_id).run_eval( return await self.routing_table.get_provider_impl(benchmark_id).run_eval(
benchmark_id=benchmark_id, benchmark_id=benchmark_id,
task_config=task_config, benchmark_config=benchmark_config,
) )
async def evaluate_rows( async def evaluate_rows(
@ -390,13 +436,14 @@ class EvalRouter(Eval):
benchmark_id: str, benchmark_id: str,
input_rows: List[Dict[str, Any]], input_rows: List[Dict[str, Any]],
scoring_functions: List[str], scoring_functions: List[str],
task_config: BenchmarkConfig, benchmark_config: BenchmarkConfig,
) -> EvaluateResponse: ) -> EvaluateResponse:
logger.debug(f"EvalRouter.evaluate_rows: {benchmark_id}, {len(input_rows)} rows")
return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows( return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows(
benchmark_id=benchmark_id, benchmark_id=benchmark_id,
input_rows=input_rows, input_rows=input_rows,
scoring_functions=scoring_functions, scoring_functions=scoring_functions,
task_config=task_config, benchmark_config=benchmark_config,
) )
async def job_status( async def job_status(
@ -404,6 +451,7 @@ class EvalRouter(Eval):
benchmark_id: str, benchmark_id: str,
job_id: str, job_id: str,
) -> Optional[JobStatus]: ) -> Optional[JobStatus]:
logger.debug(f"EvalRouter.job_status: {benchmark_id}, {job_id}")
return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id) return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id)
async def job_cancel( async def job_cancel(
@ -411,6 +459,7 @@ class EvalRouter(Eval):
benchmark_id: str, benchmark_id: str,
job_id: str, job_id: str,
) -> None: ) -> None:
logger.debug(f"EvalRouter.job_cancel: {benchmark_id}, {job_id}")
await self.routing_table.get_provider_impl(benchmark_id).job_cancel( await self.routing_table.get_provider_impl(benchmark_id).job_cancel(
benchmark_id, benchmark_id,
job_id, job_id,
@ -421,6 +470,7 @@ class EvalRouter(Eval):
benchmark_id: str, benchmark_id: str,
job_id: str, job_id: str,
) -> EvaluateResponse: ) -> EvaluateResponse:
logger.debug(f"EvalRouter.job_result: {benchmark_id}, {job_id}")
return await self.routing_table.get_provider_impl(benchmark_id).job_result( return await self.routing_table.get_provider_impl(benchmark_id).job_result(
benchmark_id, benchmark_id,
job_id, job_id,
@ -433,6 +483,7 @@ class ToolRuntimeRouter(ToolRuntime):
self, self,
routing_table: RoutingTable, routing_table: RoutingTable,
) -> None: ) -> None:
logger.debug("Initializing ToolRuntimeRouter.RagToolImpl")
self.routing_table = routing_table self.routing_table = routing_table
async def query( async def query(
@ -441,7 +492,8 @@ class ToolRuntimeRouter(ToolRuntime):
vector_db_ids: List[str], vector_db_ids: List[str],
query_config: Optional[RAGQueryConfig] = None, query_config: Optional[RAGQueryConfig] = None,
) -> RAGQueryResult: ) -> RAGQueryResult:
return await self.routing_table.get_provider_impl("query_from_memory").query( logger.debug(f"ToolRuntimeRouter.RagToolImpl.query: {vector_db_ids}")
return await self.routing_table.get_provider_impl("knowledge_search").query(
content, vector_db_ids, query_config content, vector_db_ids, query_config
) )
@ -451,6 +503,9 @@ class ToolRuntimeRouter(ToolRuntime):
vector_db_id: str, vector_db_id: str,
chunk_size_in_tokens: int = 512, chunk_size_in_tokens: int = 512,
) -> None: ) -> None:
logger.debug(
f"ToolRuntimeRouter.RagToolImpl.insert: {vector_db_id}, {len(documents)} documents, chunk_size={chunk_size_in_tokens}"
)
return await self.routing_table.get_provider_impl("insert_into_memory").insert( return await self.routing_table.get_provider_impl("insert_into_memory").insert(
documents, vector_db_id, chunk_size_in_tokens documents, vector_db_id, chunk_size_in_tokens
) )
@ -459,6 +514,7 @@ class ToolRuntimeRouter(ToolRuntime):
self, self,
routing_table: RoutingTable, routing_table: RoutingTable,
) -> None: ) -> None:
logger.debug("Initializing ToolRuntimeRouter")
self.routing_table = routing_table self.routing_table = routing_table
# HACK ALERT this should be in sync with "get_all_api_endpoints()" # HACK ALERT this should be in sync with "get_all_api_endpoints()"
@ -467,12 +523,15 @@ class ToolRuntimeRouter(ToolRuntime):
setattr(self, f"rag_tool.{method}", getattr(self.rag_tool, method)) setattr(self, f"rag_tool.{method}", getattr(self.rag_tool, method))
async def initialize(self) -> None: async def initialize(self) -> None:
logger.debug("ToolRuntimeRouter.initialize")
pass pass
async def shutdown(self) -> None: async def shutdown(self) -> None:
logger.debug("ToolRuntimeRouter.shutdown")
pass pass
async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> Any: async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> Any:
logger.debug(f"ToolRuntimeRouter.invoke_tool: {tool_name}")
return await self.routing_table.get_provider_impl(tool_name).invoke_tool( return await self.routing_table.get_provider_impl(tool_name).invoke_tool(
tool_name=tool_name, tool_name=tool_name,
kwargs=kwargs, kwargs=kwargs,
@ -481,4 +540,5 @@ class ToolRuntimeRouter(ToolRuntime):
async def list_runtime_tools( async def list_runtime_tools(
self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None self, tool_group_id: Optional[str] = None, mcp_endpoint: Optional[URL] = None
) -> List[ToolDef]: ) -> List[ToolDef]:
logger.debug(f"ToolRuntimeRouter.list_runtime_tools: {tool_group_id}")
return await self.routing_table.get_provider_impl(tool_group_id).list_tools(tool_group_id, mcp_endpoint) return await self.routing_table.get_provider_impl(tool_group_id).list_tools(tool_group_id, mcp_endpoint)
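
One behavioral change above: chat_completion and completion no longer take SamplingParams() as a default argument. Python evaluates default values once at function definition time, so such a default is shared across calls; the router now defaults to None and builds a fresh instance per request. A small illustration of the underlying pitfall with a plain list:

from typing import List, Optional

def shared_default(item: int, bucket: List[int] = []) -> List[int]:
    bucket.append(item)  # mutates the one list created at definition time
    return bucket

def fresh_default(item: int, bucket: Optional[List[int]] = None) -> List[int]:
    if bucket is None:
        bucket = []  # new object on every call
    bucket.append(item)
    return bucket

print(shared_default(1), shared_default(2))  # [1, 2] [1, 2]
print(fresh_default(1), fresh_default(2))    # [1] [2]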

View file

@ -309,23 +309,17 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
if provider_vector_db_id is None: if provider_vector_db_id is None:
provider_vector_db_id = vector_db_id provider_vector_db_id = vector_db_id
if provider_id is None: if provider_id is None:
# If provider_id not specified, use the only provider if it supports this shield type if len(self.impls_by_provider_id) > 0:
if len(self.impls_by_provider_id) == 1:
provider_id = list(self.impls_by_provider_id.keys())[0] provider_id = list(self.impls_by_provider_id.keys())[0]
if len(self.impls_by_provider_id) > 1:
logger.warning(
f"No provider specified and multiple providers available. Arbitrarily selected the first provider {provider_id}."
)
else: else:
raise ValueError( raise ValueError("No provider available. Please configure a vector_io provider.")
"No provider specified and multiple providers available. Please specify a provider_id."
)
model = await self.get_object_by_identifier("model", embedding_model) model = await self.get_object_by_identifier("model", embedding_model)
if model is None: if model is None:
if embedding_model == "all-MiniLM-L6-v2": raise ValueError(f"Model {embedding_model} not found")
raise ValueError(
"Embeddings are now served via Inference providers. "
"Please upgrade your run.yaml to include inline::sentence-transformer as an additional inference provider. "
"See https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/together/run.yaml for an example."
)
else:
raise ValueError(f"Model {embedding_model} not found")
if model.model_type != ModelType.embedding: if model.model_type != ModelType.embedding:
raise ValueError(f"Model {embedding_model} is not an embedding model") raise ValueError(f"Model {embedding_model} is not an embedding model")
if "embedding_dimension" not in model.metadata: if "embedding_dimension" not in model.metadata:
@ -373,7 +367,7 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
provider_id = list(self.impls_by_provider_id.keys())[0] provider_id = list(self.impls_by_provider_id.keys())[0]
else: else:
raise ValueError( raise ValueError(
"No provider specified and multiple providers available. Please specify a provider_id." f"No provider specified and multiple providers available. Please specify a provider_id. Available providers: {self.impls_by_provider_id.keys()}"
) )
if metadata is None: if metadata is None:
metadata = {} metadata = {}

View file

@@ -9,7 +9,6 @@ import asyncio
 import functools
 import inspect
 import json
-import logging
 import os
 import signal
 import sys
@@ -26,12 +25,14 @@ from fastapi import Path as FastapiPath
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse, StreamingResponse
 from pydantic import BaseModel, ValidationError
-from termcolor import cprint
 from typing_extensions import Annotated

 from llama_stack.distribution.datatypes import StackRunConfig
 from llama_stack.distribution.distribution import builtin_automatically_routed_apis
-from llama_stack.distribution.request_headers import set_request_provider_data
+from llama_stack.distribution.request_headers import (
+    preserve_headers_context_async_generator,
+    request_provider_data_context,
+)
 from llama_stack.distribution.resolver import InvalidProviderError
 from llama_stack.distribution.stack import (
     construct_stack,
@@ -39,6 +40,7 @@ from llama_stack.distribution.stack import (
     replace_env_vars,
     validate_env_pair,
 )
+from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api
 from llama_stack.providers.inline.telemetry.meta_reference.config import TelemetryConfig
 from llama_stack.providers.inline.telemetry.meta_reference.telemetry import (
@@ -54,8 +56,7 @@ from .endpoints import get_all_api_endpoints

 REPO_ROOT = Path(__file__).parent.parent.parent.parent

-logging.basicConfig(level=logging.INFO, format="%(levelname)s %(asctime)s %(name)s:%(lineno)d: %(message)s")
-logger = logging.getLogger(__name__)
+logger = get_logger(name=__name__, category="server")


 def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
@@ -204,15 +205,14 @@ async def maybe_await(value):

 async def sse_generator(event_gen):
     try:
-        event_gen = await event_gen
-        async for item in event_gen:
+        async for item in await event_gen:
             yield create_sse_event(item)
             await asyncio.sleep(0.01)
     except asyncio.CancelledError:
-        print("Generator cancelled")
+        logger.info("Generator cancelled")
         await event_gen.aclose()
     except Exception as e:
-        traceback.print_exception(e)
+        logger.exception("Error in sse_generator")
         yield create_sse_event(
             {
                 "error": {
@@ -224,18 +224,20 @@ async def sse_generator(event_gen):

 def create_dynamic_typed_route(func: Any, method: str, route: str):
     async def endpoint(request: Request, **kwargs):
-        set_request_provider_data(request.headers)
-
-        is_streaming = is_streaming_request(func.__name__, request, **kwargs)
-        try:
-            if is_streaming:
-                return StreamingResponse(sse_generator(func(**kwargs)), media_type="text/event-stream")
-            else:
-                value = func(**kwargs)
-                return await maybe_await(value)
-        except Exception as e:
-            traceback.print_exception(e)
-            raise translate_exception(e) from e
+        # Use context manager for request provider data
+        with request_provider_data_context(request.headers):
+            is_streaming = is_streaming_request(func.__name__, request, **kwargs)
+
+            try:
+                if is_streaming:
+                    gen = preserve_headers_context_async_generator(sse_generator(func(**kwargs)))
+                    return StreamingResponse(gen, media_type="text/event-stream")
+                else:
+                    value = func(**kwargs)
+                    return await maybe_await(value)
+            except Exception as e:
+                logger.exception("Error executing endpoint %s %s", method, route)
+                raise translate_exception(e) from e

     sig = inspect.signature(func)
@@ -433,11 +435,8 @@ def main():
                 )
             )

-        logger.info(f"Serving API {api_str}")
-        for endpoint in endpoints:
-            cprint(f" {endpoint.method.upper()} {endpoint.route}", "white")
-    print("")
+    logger.debug(f"serving APIs: {apis_to_serve}")

     app.exception_handler(RequestValidationError)(global_exception_handler)
     app.exception_handler(Exception)(global_exception_handler)
     signal.signal(signal.SIGINT, functools.partial(handle_signal, app))
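The endpoint rewrite above replaces a set-once call with a scoped context that also has to survive streaming, because the StreamingResponse keeps pulling from the generator after the with-block has exited. A minimal contextvars sketch of that pattern (illustrative only; the real helpers live in llama_stack.distribution.request_headers and their implementation is not shown in this diff):

import contextvars
from contextlib import contextmanager
from typing import Any, AsyncGenerator, Optional

PROVIDER_DATA: contextvars.ContextVar[Optional[dict]] = contextvars.ContextVar("provider_data", default=None)


@contextmanager
def request_provider_data_context(headers):
    # Bind per-request data for the duration of the with-block.
    token = PROVIDER_DATA.set(dict(headers))
    try:
        yield
    finally:
        PROVIDER_DATA.reset(token)


def preserve_headers_context_async_generator(gen: AsyncGenerator[Any, None]) -> AsyncGenerator[Any, None]:
    # Capture the value while the with-block is still active, then re-bind it each time
    # the response framework pulls the next item after the block has exited.
    captured = PROVIDER_DATA.get()

    async def wrapper():
        while True:
            PROVIDER_DATA.set(captured)
            try:
                item = await gen.__anext__()
            except StopAsyncIteration:
                break
            yield item

    return wrapper()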

View file

@@ -5,13 +5,12 @@
 # the root directory of this source tree.

 import importlib.resources
-import logging
 import os
 import re
+import tempfile
 from typing import Any, Dict, Optional

 import yaml
-from termcolor import colored

 from llama_stack.apis.agents import Agents
 from llama_stack.apis.batch_inference import BatchInference
@@ -33,13 +32,15 @@ from llama_stack.apis.telemetry import Telemetry
 from llama_stack.apis.tools import RAGToolRuntime, ToolGroups, ToolRuntime
 from llama_stack.apis.vector_dbs import VectorDBs
 from llama_stack.apis.vector_io import VectorIO
-from llama_stack.distribution.datatypes import StackRunConfig
+from llama_stack.distribution.datatypes import Provider, StackRunConfig
 from llama_stack.distribution.distribution import get_provider_registry
 from llama_stack.distribution.resolver import ProviderRegistry, resolve_impls
 from llama_stack.distribution.store.registry import create_dist_registry
+from llama_stack.distribution.utils.dynamic import instantiate_class_type
+from llama_stack.log import get_logger
 from llama_stack.providers.datatypes import Api

-log = logging.getLogger(__name__)
+logger = get_logger(name=__name__, category="core")


 class LlamaStack(
@@ -101,12 +102,10 @@ async def register_resources(run_config: StackRunConfig, impls: Dict[Api, Any]):
         objects_to_process = response.data if hasattr(response, "data") else response

         for obj in objects_to_process:
-            log.info(
-                f"{rsrc.capitalize()}: {colored(obj.identifier, 'white', attrs=['bold'])} served by {colored(obj.provider_id, 'white', attrs=['bold'])}",
+            logger.debug(
+                f"{rsrc.capitalize()}: {obj.identifier} served by {obj.provider_id}",
             )

-    log.info("")
-

 class EnvVarError(Exception):
     def __init__(self, var_name: str, path: str = ""):
@@ -155,18 +154,34 @@ def replace_env_vars(config: Any, path: str = "") -> Any:
         return result

     elif isinstance(config, str):
-        pattern = r"\${env\.([A-Z0-9_]+)(?::([^}]*))?}"
+        # Updated pattern to support both default values (:) and conditional values (+)
+        pattern = r"\${env\.([A-Z0-9_]+)(?:([:\+])([^}]*))?}"

         def get_env_var(match):
             env_var = match.group(1)
-            default_val = match.group(2)
+            operator = match.group(2)  # ':' for default, '+' for conditional
+            value_expr = match.group(3)

-            value = os.environ.get(env_var)
-            if not value:
-                if default_val is None:
-                    raise EnvVarError(env_var, path)
+            env_value = os.environ.get(env_var)
+
+            if operator == ":":  # Default value syntax: ${env.FOO:default}
+                if not env_value:
+                    if value_expr is None:
+                        raise EnvVarError(env_var, path)
+                    else:
+                        value = value_expr
                 else:
-                    value = default_val
+                    value = env_value
+            elif operator == "+":  # Conditional value syntax: ${env.FOO+value_if_set}
+                if env_value:
+                    value = value_expr
+                else:
+                    # If env var is not set, return empty string for the conditional case
+                    value = ""
+            else:  # No operator case: ${env.FOO}
+                if not env_value:
+                    raise EnvVarError(env_var, path)
+                value = env_value

             # expand "~" from the values
             return os.path.expanduser(value)
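The three substitution forms accepted by the new pattern behave differently when the variable is unset. A small usage sketch (the variable names and values are examples only, and it assumes LLAMA_STACK_PORT and OPTIONAL_FLAG are not set in the environment):

import os

from llama_stack.distribution.stack import replace_env_vars

os.environ["INFERENCE_MODEL"] = "meta-llama/Llama-3.2-3B-Instruct"

config = {
    "model": "${env.INFERENCE_MODEL}",          # plain form: raises EnvVarError if unset
    "port": "${env.LLAMA_STACK_PORT:8321}",     # ':' default form: falls back to "8321"
    "extra": "${env.OPTIONAL_FLAG+--verbose}",  # '+' conditional form: "--verbose" only if the variable is set
}

resolved = replace_env_vars(config)
# resolved == {"model": "meta-llama/Llama-3.2-3B-Instruct", "port": "8321", "extra": ""}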
@@ -215,3 +230,53 @@ def get_stack_run_config_from_template(template: str) -> StackRunConfig:

     run_config = yaml.safe_load(path.open())
     return StackRunConfig(**replace_env_vars(run_config))
+
+
+def run_config_from_adhoc_config_spec(
+    adhoc_config_spec: str, provider_registry: Optional[ProviderRegistry] = None
+) -> StackRunConfig:
+    """
+    Create an adhoc distribution from a list of API providers.
+
+    The list should be of the form "api=provider", e.g. "inference=fireworks". If you have
+    multiple pairs, separate them with commas or semicolons, e.g. "inference=fireworks,safety=llama-guard,agents=meta-reference"
+    """
+
+    api_providers = adhoc_config_spec.replace(";", ",").split(",")
+    provider_registry = provider_registry or get_provider_registry()
+
+    distro_dir = tempfile.mkdtemp()
+    provider_configs_by_api = {}
+    for api_provider in api_providers:
+        api_str, provider = api_provider.split("=")
+        api = Api(api_str)
+        providers_by_type = provider_registry[api]
+        provider_spec = providers_by_type.get(provider)
+        if not provider_spec:
+            provider_spec = providers_by_type.get(f"inline::{provider}")
+        if not provider_spec:
+            provider_spec = providers_by_type.get(f"remote::{provider}")
+        if not provider_spec:
+            raise ValueError(
+                f"Provider {provider} (or remote::{provider} or inline::{provider}) not found for API {api}"
+            )
+
+        # call method "sample_run_config" on the provider spec config class
+        provider_config_type = instantiate_class_type(provider_spec.config_class)
+        provider_config = replace_env_vars(provider_config_type.sample_run_config(__distro_dir__=distro_dir))
+
+        provider_configs_by_api[api_str] = [
+            Provider(
+                provider_id=provider,
+                provider_type=provider_spec.provider_type,
+                config=provider_config,
+            )
+        ]
+
+    config = StackRunConfig(
+        image_name="distro-test",
+        apis=list(provider_configs_by_api.keys()),
+        providers=provider_configs_by_api,
+    )
+    return config
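A usage sketch for the helper added above. The provider names are examples only; any provider registered for the given API works, and the generated sample configs may still require that provider's environment variables (API keys and the like), since they pass through replace_env_vars:

from llama_stack.distribution.stack import run_config_from_adhoc_config_spec

# "api=provider" pairs, separated by "," or ";"
run_config = run_config_from_adhoc_config_spec("inference=fireworks,safety=llama-guard")
print(run_config.image_name)  # distro-test
print(run_config.apis)        # ['inference', 'safety']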

Some files were not shown because too many files have changed in this diff.