Merge branch 'main' into add-mcp-streamable-http-support

This commit is contained in:
Calum Murray 2025-07-18 14:38:54 -04:00 committed by GitHub
commit c715f30e65
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
247 changed files with 9685 additions and 5249 deletions

View file

@ -4,3 +4,9 @@ omit =
*/llama_stack/providers/*
*/llama_stack/templates/*
.venv/*
*/llama_stack/cli/scripts/*
*/llama_stack/ui/*
*/llama_stack/distribution/ui/*
*/llama_stack/strong_typing/*
*/llama_stack/env.py
*/__init__.py

2
.github/CODEOWNERS vendored
View file

@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning @reluctantfuturist @mattf @slekkala1

30
.github/ISSUE_TEMPLATE/tech-debt.yml vendored Normal file
View file

@ -0,0 +1,30 @@
name: 🔧 Tech Debt
description: Something that is functional but should be improved or optimizied
labels: ["tech-debt"]
body:
- type: textarea
id: tech-debt-explanation
attributes:
label: 🤔 What is the technical debt you think should be addressed?
description: >
A clear and concise description of _what_ needs to be addressed - ensure you are describing
constitutes [technical debt](https://en.wikipedia.org/wiki/Technical_debt) and is not a bug
or feature request.
validations:
required: true
- type: textarea
id: tech-debt-motivation
attributes:
label: 💡 What is the benefit of addressing this technical debt?
description: >
A clear and concise description of _why_ this work is needed.
validations:
required: true
- type: textarea
id: other-thoughts
attributes:
label: Other thoughts
description: >
Any thoughts about how this may result in complexity in the codebase, or other trade-offs.

View file

@ -7,3 +7,5 @@ runs:
shell: bash
run: |
docker run -d --name ollama -p 11434:11434 docker.io/leseb/ollama-with-models
echo "Verifying Ollama status..."
timeout 30 bash -c 'while ! curl -s -L http://127.0.0.1:11434; do sleep 1 && echo "."; done'

View file

@ -5,6 +5,10 @@ inputs:
description: The Python version to use
required: false
default: "3.12"
client-version:
description: The llama-stack-client-python version to test against (latest or published)
required: false
default: "latest"
runs:
using: "composite"
steps:
@ -20,8 +24,17 @@ runs:
run: |
uv sync --all-groups
uv pip install ollama faiss-cpu
# always test against the latest version of the client
# TODO: this is not necessarily a good idea. we need to test against both published and latest
# to find out backwards compatibility issues.
uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
# Install llama-stack-client-python based on the client-version input
if [ "${{ inputs.client-version }}" = "latest" ]; then
echo "Installing latest llama-stack-client-python from main branch"
uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
elif [ "${{ inputs.client-version }}" = "published" ]; then
echo "Installing published llama-stack-client-python from PyPI"
uv pip install llama-stack-client
else
echo "Invalid client-version: ${{ inputs.client-version }}"
exit 1
fi
uv pip install -e .

57
.github/workflows/coverage-badge.yml vendored Normal file
View file

@ -0,0 +1,57 @@
name: Coverage Badge
on:
push:
branches: [ main ]
paths:
- 'llama_stack/**'
- 'tests/unit/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/unit-tests.yml'
- '.github/workflows/coverage-badge.yml' # This workflow
workflow_dispatch:
jobs:
unit-tests:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Run unit tests
run: |
./scripts/unit-tests.sh
- name: Coverage Badge
uses: tj-actions/coverage-badge-py@1788babcb24544eb5bbb6e0d374df5d1e54e670f # v2.0.4
- name: Verify Changed files
uses: tj-actions/verify-changed-files@a1c6acee9df209257a246f2cc6ae8cb6581c1edf # v20.0.4
id: verify-changed-files
with:
files: coverage.svg
- name: Commit files
if: steps.verify-changed-files.outputs.files_changed == 'true'
run: |
git config --local user.email "github-actions[bot]@users.noreply.github.com"
git config --local user.name "github-actions[bot]"
git add coverage.svg
git commit -m "Updated coverage.svg"
- name: Create Pull Request
if: steps.verify-changed-files.outputs.files_changed == 'true'
uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
with:
token: ${{ secrets.GITHUB_TOKEN }}
title: "ci: [Automatic] Coverage Badge Update"
body: |
This PR updates the coverage badge based on the latest coverage report.
Automatically generated by the [workflow coverage-badge.yaml](.github/workflows/coverage-badge.yaml)
delete-branch: true

View file

@ -1,355 +0,0 @@
name: "Run Llama-stack Tests"
on:
#### Temporarily disable PR runs until tests run as intended within mainline.
#TODO Add this back.
#pull_request_target:
# types: ["opened"]
# branches:
# - 'main'
# paths:
# - 'llama_stack/**/*.py'
# - 'tests/**/*.py'
workflow_dispatch:
inputs:
runner:
description: 'GHA Runner Scale Set label to run workflow on.'
required: true
default: "llama-stack-gha-runner-gpu"
checkout_reference:
description: "The branch, tag, or SHA to checkout"
required: true
default: "main"
debug:
description: 'Run debugging steps?'
required: false
default: "true"
sleep_time:
description: '[DEBUG] sleep time for debugging'
required: true
default: "0"
provider_id:
description: 'ID of your provider'
required: true
default: "meta_reference"
model_id:
description: 'Shorthand name for target model ID (llama_3b or llama_8b)'
required: true
default: "llama_3b"
model_override_3b:
description: 'Specify shorthand model for <llama_3b> '
required: false
default: "Llama3.2-3B-Instruct"
model_override_8b:
description: 'Specify shorthand model for <llama_8b> '
required: false
default: "Llama3.1-8B-Instruct"
env:
# ID used for each test's provider config
PROVIDER_ID: "${{ inputs.provider_id || 'meta_reference' }}"
# Path to model checkpoints within EFS volume
MODEL_CHECKPOINT_DIR: "/data/llama"
# Path to directory to run tests from
TESTS_PATH: "${{ github.workspace }}/llama_stack/providers/tests"
# Keep track of a list of model IDs that are valid to use within pytest fixture marks
AVAILABLE_MODEL_IDs: "llama_3b llama_8b"
# Shorthand name for model ID, used in pytest fixture marks
MODEL_ID: "${{ inputs.model_id || 'llama_3b' }}"
# Override the `llama_3b` / `llama_8b' models, else use the default.
LLAMA_3B_OVERRIDE: "${{ inputs.model_override_3b || 'Llama3.2-3B-Instruct' }}"
LLAMA_8B_OVERRIDE: "${{ inputs.model_override_8b || 'Llama3.1-8B-Instruct' }}"
# Defines which directories in TESTS_PATH to exclude from the test loop
EXCLUDED_DIRS: "__pycache__"
# Defines the output xml reports generated after a test is run
REPORTS_GEN: ""
jobs:
execute_workflow:
name: Execute workload on Self-Hosted GPU k8s runner
permissions:
pull-requests: write
defaults:
run:
shell: bash
runs-on: ${{ inputs.runner != '' && inputs.runner || 'llama-stack-gha-runner-gpu' }}
if: always()
steps:
##############################
#### INITIAL DEBUG CHECKS ####
##############################
- name: "[DEBUG] Check content of the EFS mount"
id: debug_efs_volume
continue-on-error: true
if: inputs.debug == 'true'
run: |
echo "========= Content of the EFS mount ============="
ls -la ${{ env.MODEL_CHECKPOINT_DIR }}
- name: "[DEBUG] Get runner container OS information"
id: debug_os_info
if: ${{ inputs.debug == 'true' }}
run: |
cat /etc/os-release
- name: "[DEBUG] Print environment variables"
id: debug_env_vars
if: ${{ inputs.debug == 'true' }}
run: |
echo "PROVIDER_ID = ${PROVIDER_ID}"
echo "MODEL_CHECKPOINT_DIR = ${MODEL_CHECKPOINT_DIR}"
echo "AVAILABLE_MODEL_IDs = ${AVAILABLE_MODEL_IDs}"
echo "MODEL_ID = ${MODEL_ID}"
echo "LLAMA_3B_OVERRIDE = ${LLAMA_3B_OVERRIDE}"
echo "LLAMA_8B_OVERRIDE = ${LLAMA_8B_OVERRIDE}"
echo "EXCLUDED_DIRS = ${EXCLUDED_DIRS}"
echo "REPORTS_GEN = ${REPORTS_GEN}"
############################
#### MODEL INPUT CHECKS ####
############################
- name: "Check if env.model_id is valid"
id: check_model_id
run: |
if [[ " ${AVAILABLE_MODEL_IDs[@]} " =~ " ${MODEL_ID} " ]]; then
echo "Model ID '${MODEL_ID}' is valid."
else
echo "Model ID '${MODEL_ID}' is invalid. Terminating workflow."
exit 1
fi
#######################
#### CODE CHECKOUT ####
#######################
- name: "Checkout 'meta-llama/llama-stack' repository"
id: checkout_repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ inputs.branch }}
- name: "[DEBUG] Content of the repository after checkout"
id: debug_content_after_checkout
if: ${{ inputs.debug == 'true' }}
run: |
ls -la ${GITHUB_WORKSPACE}
##########################################################
#### OPTIONAL SLEEP DEBUG ####
# #
# Use to "exec" into the test k8s POD and run tests #
# manually to identify what dependencies are being used. #
# #
##########################################################
- name: "[DEBUG] sleep"
id: debug_sleep
if: ${{ inputs.debug == 'true' && inputs.sleep_time != '' }}
run: |
sleep ${{ inputs.sleep_time }}
############################
#### UPDATE SYSTEM PATH ####
############################
- name: "Update path: execute"
id: path_update_exec
run: |
# .local/bin is needed for certain libraries installed below to be recognized
# when calling their executable to install sub-dependencies
mkdir -p ${HOME}/.local/bin
echo "${HOME}/.local/bin" >> "$GITHUB_PATH"
#####################################
#### UPDATE CHECKPOINT DIRECTORY ####
#####################################
- name: "Update checkpoint directory"
id: checkpoint_update
run: |
echo "Checkpoint directory: ${MODEL_CHECKPOINT_DIR}/$LLAMA_3B_OVERRIDE"
if [ "${MODEL_ID}" = "llama_3b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" ]; then
echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" >> "$GITHUB_ENV"
elif [ "${MODEL_ID}" = "llama_8b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" ]; then
echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" >> "$GITHUB_ENV"
else
echo "MODEL_ID & LLAMA_*B_OVERRIDE are not a valid pairing. Terminating workflow."
exit 1
fi
- name: "[DEBUG] Checkpoint update check"
id: debug_checkpoint_update
if: ${{ inputs.debug == 'true' }}
run: |
echo "MODEL_CHECKPOINT_DIR (after update) = ${MODEL_CHECKPOINT_DIR}"
##################################
#### DEPENDENCY INSTALLATIONS ####
##################################
- name: "Installing 'apt' required packages"
id: install_apt
run: |
echo "[STEP] Installing 'apt' required packages"
sudo apt update -y
sudo apt install -y python3 python3-pip npm wget
- name: "Installing packages with 'curl'"
id: install_curl
run: |
curl -fsSL https://ollama.com/install.sh | sh
- name: "Installing packages with 'wget'"
id: install_wget
run: |
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
chmod +x Miniconda3-latest-Linux-x86_64.sh
./Miniconda3-latest-Linux-x86_64.sh -b install -c pytorch -c nvidia faiss-gpu=1.9.0
# Add miniconda3 bin to system path
echo "${HOME}/miniconda3/bin" >> "$GITHUB_PATH"
- name: "Installing packages with 'npm'"
id: install_npm_generic
run: |
sudo npm install -g junit-merge
- name: "Installing pip dependencies"
id: install_pip_generic
run: |
echo "[STEP] Installing 'llama-stack' models"
pip install -U pip setuptools
pip install -r requirements.txt
pip install -e .
pip install -U \
torch torchvision \
pytest pytest_asyncio \
fairscale lm-format-enforcer \
zmq chardet pypdf \
pandas sentence_transformers together \
aiosqlite
- name: "Installing packages with conda"
id: install_conda_generic
run: |
conda install -q -c pytorch -c nvidia faiss-gpu=1.9.0
#############################################################
#### TESTING TO BE DONE FOR BOTH PRS AND MANUAL DISPATCH ####
#############################################################
- name: "Run Tests: Loop"
id: run_tests_loop
working-directory: "${{ github.workspace }}"
run: |
pattern=""
for dir in llama_stack/providers/tests/*; do
if [ -d "$dir" ]; then
dir_name=$(basename "$dir")
if [[ ! " $EXCLUDED_DIRS " =~ " $dir_name " ]]; then
for file in "$dir"/test_*.py; do
test_name=$(basename "$file")
new_file="result-${dir_name}-${test_name}.xml"
if torchrun $(which pytest) -s -v ${TESTS_PATH}/${dir_name}/${test_name} -m "${PROVIDER_ID} and ${MODEL_ID}" \
--junitxml="${{ github.workspace }}/${new_file}"; then
echo "Ran test: ${test_name}"
else
echo "Did NOT run test: ${test_name}"
fi
pattern+="${new_file} "
done
fi
fi
done
echo "REPORTS_GEN=$pattern" >> "$GITHUB_ENV"
- name: "Test Summary: Merge"
id: test_summary_merge
working-directory: "${{ github.workspace }}"
run: |
echo "Merging the following test result files: ${REPORTS_GEN}"
# Defaults to merging them into 'merged-test-results.xml'
junit-merge ${{ env.REPORTS_GEN }}
############################################
#### AUTOMATIC TESTING ON PULL REQUESTS ####
############################################
#### Run tests ####
- name: "PR - Run Tests"
id: pr_run_tests
working-directory: "${{ github.workspace }}"
if: github.event_name == 'pull_request_target'
run: |
echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${GITHUB_WORKSPACE} | path: ${{ github.workspace }}"
# (Optional) Add more tests here.
# Merge test results with 'merged-test-results.xml' from above.
# junit-merge <new-test-results> merged-test-results.xml
#### Create test summary ####
- name: "PR - Test Summary"
id: pr_test_summary_create
if: github.event_name == 'pull_request_target'
uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
with:
paths: "${{ github.workspace }}/merged-test-results.xml"
output: test-summary.md
- name: "PR - Upload Test Summary"
id: pr_test_summary_upload
if: github.event_name == 'pull_request_target'
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: test-summary
path: test-summary.md
#### Update PR request ####
- name: "PR - Update comment"
id: pr_update_comment
if: github.event_name == 'pull_request_target'
uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3.0.1
with:
filePath: test-summary.md
########################
#### MANUAL TESTING ####
########################
#### Run tests ####
- name: "Manual - Run Tests: Prep"
id: manual_run_tests
working-directory: "${{ github.workspace }}"
if: github.event_name == 'workflow_dispatch'
run: |
echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${{ github.workspace }}"
#TODO Use this when collection errors are resolved
# pytest -s -v -m "${PROVIDER_ID} and ${MODEL_ID}" --junitxml="${{ github.workspace }}/merged-test-results.xml"
# (Optional) Add more tests here.
# Merge test results with 'merged-test-results.xml' from above.
# junit-merge <new-test-results> merged-test-results.xml
#### Create test summary ####
- name: "Manual - Test Summary"
id: manual_test_summary
if: always() && github.event_name == 'workflow_dispatch'
uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
with:
paths: "${{ github.workspace }}/merged-test-results.xml"

View file

@ -3,10 +3,10 @@ name: Installer CI
on:
pull_request:
paths:
- 'install.sh'
- 'scripts/install.sh'
push:
paths:
- 'install.sh'
- 'scripts/install.sh'
schedule:
- cron: '0 2 * * *' # every day at 02:00 UTC
@ -16,11 +16,11 @@ jobs:
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
- name: Run ShellCheck on install.sh
run: shellcheck install.sh
run: shellcheck scripts/install.sh
smoke-test:
needs: lint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
- name: Run installer end-to-end
run: ./install.sh
run: ./scripts/install.sh

View file

@ -35,7 +35,7 @@ jobs:
- name: Install minikube
if: ${{ matrix.auth-provider == 'kubernetes' }}
uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19
uses: medyagh/setup-minikube@e3c7f79eb1e997eabccc536a6cf318a2b0fe19d9 # v0.0.20
- name: Start minikube
if: ${{ matrix.auth-provider == 'oauth2_token' }}
@ -73,9 +73,12 @@ jobs:
server:
port: 8321
EOF
yq eval '.server.auth = {"provider_type": "${{ matrix.auth-provider }}"}' -i $run_dir/run.yaml
yq eval '.server.auth.config = {"tls_cafile": "${{ env.KUBERNETES_CA_CERT_PATH }}", "issuer": "${{ env.KUBERNETES_ISSUER }}", "audience": "${{ env.KUBERNETES_AUDIENCE }}"}' -i $run_dir/run.yaml
yq eval '.server.auth.config.jwks = {"uri": "${{ env.KUBERNETES_API_SERVER_URL }}", "token": "${{ env.TOKEN }}"}' -i $run_dir/run.yaml
yq eval '.server.auth.provider_config.type = "${{ matrix.auth-provider }}"' -i $run_dir/run.yaml
yq eval '.server.auth.provider_config.tls_cafile = "${{ env.KUBERNETES_CA_CERT_PATH }}"' -i $run_dir/run.yaml
yq eval '.server.auth.provider_config.issuer = "${{ env.KUBERNETES_ISSUER }}"' -i $run_dir/run.yaml
yq eval '.server.auth.provider_config.audience = "${{ env.KUBERNETES_AUDIENCE }}"' -i $run_dir/run.yaml
yq eval '.server.auth.provider_config.jwks.uri = "${{ env.KUBERNETES_API_SERVER_URL }}"' -i $run_dir/run.yaml
yq eval '.server.auth.provider_config.jwks.token = "${{ env.TOKEN }}"' -i $run_dir/run.yaml
cat $run_dir/run.yaml
nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &

View file

@ -0,0 +1,70 @@
name: SqlStore Integration Tests
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'llama_stack/providers/utils/sqlstore/**'
- 'tests/integration/sqlstore/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/integration-sql-store-tests.yml' # This workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
test-postgres:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12", "3.13"]
fail-fast: false
services:
postgres:
image: postgres:15
env:
POSTGRES_USER: llamastack
POSTGRES_PASSWORD: llamastack
POSTGRES_DB: llamastack
ports:
- 5432:5432
options: >-
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
with:
python-version: ${{ matrix.python-version }}
- name: Run SqlStore Integration Tests
env:
ENABLE_POSTGRES_TESTS: "true"
POSTGRES_HOST: localhost
POSTGRES_PORT: 5432
POSTGRES_DB: llamastack
POSTGRES_USER: llamastack
POSTGRES_PASSWORD: llamastack
run: |
uv run pytest -sv tests/integration/providers/utils/sqlstore/
- name: Upload test logs
if: ${{ always() }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: postgres-test-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.python-version }}
path: |
*.log
retention-days: 1

View file

@ -7,27 +7,54 @@ on:
branches: [ main ]
paths:
- 'llama_stack/**'
- 'tests/integration/**'
- 'tests/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/integration-tests.yml' # This workflow
- '.github/actions/setup-ollama/action.yml'
schedule:
- cron: '0 0 * * *' # Daily at 12 AM UTC
workflow_dispatch:
inputs:
test-all-client-versions:
description: 'Test against both the latest and published versions'
type: boolean
default: false
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
test-matrix:
discover-tests:
runs-on: ubuntu-latest
outputs:
test-type: ${{ steps.generate-matrix.outputs.test-type }}
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Generate test matrix
id: generate-matrix
run: |
# Get test directories dynamically, excluding non-test directories
TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
grep -Ev "^(__pycache__|fixtures|test_cases)$" |
sort | jq -R -s -c 'split("\n")[:-1]')
echo "test-type=$TEST_TYPES" >> $GITHUB_OUTPUT
test-matrix:
needs: discover-tests
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
# Listing tests manually since some of them currently fail
# TODO: generate matrix list from tests/integration when fixed
test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime, vector_io]
test-type: ${{ fromJson(needs.discover-tests.outputs.test-type) }}
client-type: [library, server]
python-version: ["3.12", "3.13"]
fail-fast: false # we want to run all tests regardless of failure
client-version: ${{ (github.event_name == 'schedule' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
steps:
- name: Checkout repository
@ -37,6 +64,7 @@ jobs:
uses: ./.github/actions/setup-runner
with:
python-version: ${{ matrix.python-version }}
client-version: ${{ matrix.client-version }}
- name: Setup ollama
uses: ./.github/actions/setup-ollama
@ -53,10 +81,15 @@ jobs:
- name: Run Integration Tests
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
OLLAMA_INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" # for library tests
ENABLE_OLLAMA: "ollama" # for library tests
OLLAMA_INFERENCE_MODEL: "llama3.2:3b-instruct-fp16" # for server tests
ENABLE_OLLAMA: "ollama" # for server tests
OLLAMA_URL: "http://0.0.0.0:11434"
SAFETY_MODEL: "llama-guard3:1b"
LLAMA_STACK_CLIENT_TIMEOUT: "300" # Increased timeout for eval operations
# Use 'shell' to get pipefail behavior
# https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#exit-codes-and-error-action-preference
# TODO: write a precommit hook to detect if a test contains a pipe but does not use 'shell: bash'
shell: bash
run: |
if [ "${{ matrix.client-type }}" == "library" ]; then
stack_config="starter"
@ -65,8 +98,9 @@ jobs:
fi
uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
-k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
--text-model="ollama/meta-llama/Llama-3.2-3B-Instruct" \
--text-model="ollama/llama3.2:3b-instruct-fp16" \
--embedding-model=all-MiniLM-L6-v2 \
--safety-shield=$SAFETY_MODEL \
--color=yes \
--capture=tee-sys | tee pytest-${{ matrix.test-type }}.log
@ -85,7 +119,7 @@ jobs:
if: ${{ always() }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}
name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}-${{ matrix.client-version }}
path: |
*.log
retention-days: 1

View file

@ -1,69 +0,0 @@
name: auto-tests
on:
# pull_request:
workflow_dispatch:
inputs:
commit_sha:
description: 'Specific Commit SHA to trigger on'
required: false
default: $GITHUB_SHA # default to the last commit of $GITHUB_REF branch
jobs:
test-llama-stack-as-library:
runs-on: ubuntu-latest
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
strategy:
matrix:
provider: [fireworks, together]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ github.event.inputs.commit_sha }}
- name: Echo commit SHA
run: |
echo "Triggered on commit SHA: ${{ github.event.inputs.commit_sha }}"
git rev-parse HEAD
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt pytest
pip install -e .
- name: Build providers
run: |
llama stack build --template ${{ matrix.provider }} --image-type venv
- name: Install the latest llama-stack-client & llama-models packages
run: |
pip install -e git+https://github.com/meta-llama/llama-stack-client-python.git#egg=llama-stack-client
pip install -e git+https://github.com/meta-llama/llama-models.git#egg=llama-models
- name: Run client-sdk test
working-directory: "${{ github.workspace }}"
env:
REPORT_OUTPUT: md_report.md
shell: bash
run: |
pip install --upgrade pytest-md-report
echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV"
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT"
- name: Output reports to the job summary
if: always()
shell: bash
run: |
if [ -f "$REPORT_FILE" ]; then
echo "<details><summary> Test Report for ${{ matrix.provider }} </summary>" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
cat "$REPORT_FILE" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "</details>" >> $GITHUB_STEP_SUMMARY
fi

View file

@ -36,7 +36,7 @@ jobs:
- name: Run unit tests
run: |
PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --cov=llama_stack --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }}
PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --junitxml=pytest-report-${{ matrix.python }}.xml
- name: Upload test results
if: always()

View file

@ -29,7 +29,7 @@ repos:
- id: check-toml
- repo: https://github.com/Lucas-C/pre-commit-hooks
rev: v1.5.4
rev: v1.5.5
hooks:
- id: insert-license
files: \.py$|\.sh$
@ -38,7 +38,7 @@ repos:
- docs/license_header.txt
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.4
rev: v0.12.2
hooks:
- id: ruff
args: [ --fix ]
@ -46,14 +46,14 @@ repos:
- id: ruff-format
- repo: https://github.com/adamchainz/blacken-docs
rev: 1.19.0
rev: 1.19.1
hooks:
- id: blacken-docs
additional_dependencies:
- black==24.3.0
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.7.8
rev: 0.7.20
hooks:
- id: uv-lock
- id: uv-export
@ -66,7 +66,7 @@ repos:
]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.15.0
rev: v1.16.1
hooks:
- id: mypy
additional_dependencies:
@ -129,7 +129,28 @@ repos:
require_serial: true
always_run: true
files: ^llama_stack/.*$
- id: forbid-pytest-asyncio
name: Block @pytest.mark.asyncio and @pytest_asyncio.fixture
entry: bash
language: system
types: [python]
pass_filenames: true
args:
- -c
- |
grep -EnH '^[^#]*@pytest\.mark\.asyncio|@pytest_asyncio\.fixture' "$@" && {
echo;
echo "❌ Do not use @pytest.mark.asyncio or @pytest_asyncio.fixture."
echo " pytest is already configured with async-mode=auto."
echo;
exit 1;
} || true
ci:
autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate
autofix_prs: true
autoupdate_branch: ''
autoupdate_schedule: weekly
skip: []
submodules: false

View file

@ -66,7 +66,7 @@ You can install the dependencies by running:
```bash
cd llama-stack
uv sync --extra dev
uv sync --group dev
uv pip install -e .
source .venv/bin/activate
```
@ -112,7 +112,7 @@ uv run pre-commit run --all-files
## Running tests
You can find the Llama Stack testing documentation here [here](tests/README.md).
You can find the Llama Stack testing documentation [here](https://github.com/meta-llama/llama-stack/blob/main/tests/README.md).
## Adding a new dependency to the project
@ -168,7 +168,7 @@ manually as they are auto-generated.
### Updating the provider documentation
If you have made changes to a provider's configuration, you should run `./scripts/distro_codegen.py`
If you have made changes to a provider's configuration, you should run `./scripts/provider_codegen.py`
to re-generate the documentation. You should not change `docs/source/.../providers/` files manually
as they are auto-generated.
Note that the provider "description" field will be used to generate the provider documentation.

View file

@ -6,6 +6,7 @@
[![Discord](https://img.shields.io/discord/1257833999603335178?color=6A7EC2&logo=discord&logoColor=ffffff)](https://discord.gg/llama-stack)
[![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
[![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)
![coverage badge](./coverage.svg)
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
@ -77,7 +78,7 @@ As more providers start supporting Llama 4, you can use them in Llama Stack as w
To try Llama Stack locally, run:
```bash
curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/install.sh | bash
curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/scripts/install.sh | bash
```
### Overview

21
coverage.svg Normal file
View file

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" width="99" height="20">
<linearGradient id="b" x2="0" y2="100%">
<stop offset="0" stop-color="#bbb" stop-opacity=".1"/>
<stop offset="1" stop-opacity=".1"/>
</linearGradient>
<mask id="a">
<rect width="99" height="20" rx="3" fill="#fff"/>
</mask>
<g mask="url(#a)">
<path fill="#555" d="M0 0h63v20H0z"/>
<path fill="#fe7d37" d="M63 0h36v20H63z"/>
<path fill="url(#b)" d="M0 0h99v20H0z"/>
</g>
<g fill="#fff" text-anchor="middle" font-family="DejaVu Sans,Verdana,Geneva,sans-serif" font-size="11">
<text x="31.5" y="15" fill="#010101" fill-opacity=".3">coverage</text>
<text x="31.5" y="14">coverage</text>
<text x="80" y="15" fill="#010101" fill-opacity=".3">44%</text>
<text x="80" y="14">44%</text>
</g>
</svg>

After

Width:  |  Height:  |  Size: 904 B

View file

@ -11132,8 +11132,38 @@
"title": "Trace"
},
"Checkpoint": {
"description": "Checkpoint created during training runs",
"title": "Checkpoint"
"type": "object",
"properties": {
"identifier": {
"type": "string"
},
"created_at": {
"type": "string",
"format": "date-time"
},
"epoch": {
"type": "integer"
},
"post_training_job_id": {
"type": "string"
},
"path": {
"type": "string"
},
"training_metrics": {
"$ref": "#/components/schemas/PostTrainingMetric"
}
},
"additionalProperties": false,
"required": [
"identifier",
"created_at",
"epoch",
"post_training_job_id",
"path"
],
"title": "Checkpoint",
"description": "Checkpoint created during training runs"
},
"PostTrainingJobArtifactsResponse": {
"type": "object",
@ -11156,6 +11186,31 @@
"title": "PostTrainingJobArtifactsResponse",
"description": "Artifacts of a finetuning job."
},
"PostTrainingMetric": {
"type": "object",
"properties": {
"epoch": {
"type": "integer"
},
"train_loss": {
"type": "number"
},
"validation_loss": {
"type": "number"
},
"perplexity": {
"type": "number"
}
},
"additionalProperties": false,
"required": [
"epoch",
"train_loss",
"validation_loss",
"perplexity"
],
"title": "PostTrainingMetric"
},
"PostTrainingJobStatusResponse": {
"type": "object",
"properties": {
@ -11285,6 +11340,9 @@
},
"embedding_dimension": {
"type": "integer"
},
"vector_db_name": {
"type": "string"
}
},
"additionalProperties": false,
@ -13535,10 +13593,6 @@
"provider_id": {
"type": "string",
"description": "The ID of the provider to use for this vector store."
},
"provider_vector_db_id": {
"type": "string",
"description": "The provider-specific vector database ID."
}
},
"additionalProperties": false,
@ -14741,7 +14795,8 @@
"description": "Template for formatting each retrieved chunk in the context. Available placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content string), {metadata} (chunk metadata dict). Default: \"Result {index}\\nContent: {chunk.content}\\nMetadata: {metadata}\\n\""
},
"mode": {
"type": "string",
"$ref": "#/components/schemas/RAGSearchMode",
"default": "vector",
"description": "Search mode for retrieval—either \"vector\", \"keyword\", or \"hybrid\". Default \"vector\"."
},
"ranker": {
@ -14776,6 +14831,16 @@
}
}
},
"RAGSearchMode": {
"type": "string",
"enum": [
"vector",
"keyword",
"hybrid"
],
"title": "RAGSearchMode",
"description": "Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search for semantic matching - KEYWORD: Uses keyword-based search for exact matching - HYBRID: Combines both vector and keyword search for better results"
},
"RRFRanker": {
"type": "object",
"properties": {
@ -15568,6 +15633,10 @@
"type": "string",
"description": "The identifier of the provider."
},
"vector_db_name": {
"type": "string",
"description": "The name of the vector database."
},
"provider_vector_db_id": {
"type": "string",
"description": "The identifier of the vector database in the provider."

View file

@ -7838,8 +7838,30 @@ components:
- start_time
title: Trace
Checkpoint:
description: Checkpoint created during training runs
type: object
properties:
identifier:
type: string
created_at:
type: string
format: date-time
epoch:
type: integer
post_training_job_id:
type: string
path:
type: string
training_metrics:
$ref: '#/components/schemas/PostTrainingMetric'
additionalProperties: false
required:
- identifier
- created_at
- epoch
- post_training_job_id
- path
title: Checkpoint
description: Checkpoint created during training runs
PostTrainingJobArtifactsResponse:
type: object
properties:
@ -7855,6 +7877,24 @@ components:
- checkpoints
title: PostTrainingJobArtifactsResponse
description: Artifacts of a finetuning job.
PostTrainingMetric:
type: object
properties:
epoch:
type: integer
train_loss:
type: number
validation_loss:
type: number
perplexity:
type: number
additionalProperties: false
required:
- epoch
- train_loss
- validation_loss
- perplexity
title: PostTrainingMetric
PostTrainingJobStatusResponse:
type: object
properties:
@ -7944,6 +7984,8 @@ components:
type: string
embedding_dimension:
type: integer
vector_db_name:
type: string
additionalProperties: false
required:
- identifier
@ -9454,10 +9496,6 @@ components:
type: string
description: >-
The ID of the provider to use for this vector store.
provider_vector_db_id:
type: string
description: >-
The provider-specific vector database ID.
additionalProperties: false
required:
- name
@ -10306,7 +10344,8 @@ components:
content string), {metadata} (chunk metadata dict). Default: "Result {index}\nContent:
{chunk.content}\nMetadata: {metadata}\n"
mode:
type: string
$ref: '#/components/schemas/RAGSearchMode'
default: vector
description: >-
Search mode for retrieval—either "vector", "keyword", or "hybrid". Default
"vector".
@ -10333,6 +10372,17 @@ components:
mapping:
default: '#/components/schemas/DefaultRAGQueryGeneratorConfig'
llm: '#/components/schemas/LLMRAGQueryGeneratorConfig'
RAGSearchMode:
type: string
enum:
- vector
- keyword
- hybrid
title: RAGSearchMode
description: >-
Search modes for RAG query retrieval: - VECTOR: Uses vector similarity search
for semantic matching - KEYWORD: Uses keyword-based search for exact matching
- HYBRID: Combines both vector and keyword search for better results
RRFRanker:
type: object
properties:
@ -10893,6 +10943,9 @@ components:
provider_id:
type: string
description: The identifier of the provider.
vector_db_name:
type: string
description: The name of the vector database.
provider_vector_db_id:
type: string
description: >-

View file

@ -55,7 +55,7 @@
"\n",
"MODEL=\"Llama-4-Scout-17B-16E-Instruct\"\n",
"# get meta url from llama.com\n",
"!uv run --with llama-stackllama model download --source meta --model-id $MODEL --meta-url <META_URL>\n",
"!uv run --with llama-stack llama model download --source meta --model-id $MODEL --meta-url <META_URL>\n",
"\n",
"model_id = f\"meta-llama/{MODEL}\""
]

View file

@ -1,5 +1,7 @@
# The Llama Stack API
*Originally authored Jul 23, 2024*
**Authors:**
* Meta: @raghotham, @ashwinb, @hjshah, @jspisak
@ -24,7 +26,7 @@ Meta releases weights of both the pretrained and instruction fine-tuned Llama mo
### Model Lifecycle
![Figure 1: Model Life Cycle](../docs/resources/model-lifecycle.png)
![Figure 1: Model Life Cycle](resources/model-lifecycle.png)
For each of the operations that need to be performed (e.g. fine tuning, inference, evals etc) during the model life cycle, we identified the capabilities as toolchain APIs that are needed. Some of these capabilities are primitive operations like inference while other capabilities like synthetic data generation are composed of other capabilities. The list of APIs we have identified to support the lifecycle of Llama models is below:
@ -37,7 +39,7 @@ For each of the operations that need to be performed (e.g. fine tuning, inferenc
### Agentic System
![Figure 2: Agentic System](../docs/resources/agentic-system.png)
![Figure 2: Agentic System](resources/agentic-system.png)
In addition to the model lifecycle, we considered the different components involved in an agentic system. Specifically around tool calling and shields. Since the model may decide to call tools, a single model inference call is not enough. Whats needed is an agentic loop consisting of tool calls and inference. The model provides separate tokens representing end-of-message and end-of-turn. A message represents a possible stopping point for execution where the model can inform the execution environment that a tool call needs to be made. The execution environment, upon execution, adds back the result to the context window and makes another inference call. This process can get repeated until an end-of-turn token is generated.
Note that as of today, in the OSS world, such a “loop” is often coded explicitly via elaborate prompt engineering using a ReAct pattern (typically) or preconstructed execution graph. Llama 3.1 (and future Llamas) attempts to absorb this multi-step reasoning loop inside the main model itself.
@ -63,9 +65,9 @@ The sequence diagram that details the steps is [here](https://github.com/meta-ll
We define the Llama Stack as a layer cake shown below.
![Figure 3: Llama Stack](../docs/resources/llama-stack.png)
![Figure 3: Llama Stack](resources/llama-stack.png)
The API is defined in the [YAML](../docs/_static/llama-stack-spec.yaml) and [HTML](../docs/_static/llama-stack-spec.html) files.
The API is defined in the [YAML](_static/llama-stack-spec.yaml) and [HTML](_static/llama-stack-spec.html) files.
## Sample implementations

View file

@ -145,12 +145,12 @@
" del os.environ[\"UV_SYSTEM_PYTHON\"]\n",
"\n",
"# this command installs all the dependencies needed for the llama stack server with the ollama inference provider\n",
"!uv run --with llama-stack llama stack build --template ollama --image-type venv --image-name myvenv\n",
"!uv run --with llama-stack llama stack build --template starter --image-type venv\n",
"\n",
"def run_llama_stack_server_background():\n",
" log_file = open(\"llama_stack_server.log\", \"w\")\n",
" process = subprocess.Popen(\n",
" f\"uv run --with llama-stack llama stack run ollama --image-type venv --image-name myvenv --env INFERENCE_MODEL=llama3.2:3b\",\n",
" f\"uv run --with llama-stack llama stack run starter --image-type venv --env INFERENCE_MODEL=llama3.2:3b\",\n",
" shell=True,\n",
" stdout=log_file,\n",
" stderr=log_file,\n",
@ -249,18 +249,23 @@
],
"source": [
"from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient\n",
"import os\n",
"\n",
"os.environ[\"ENABLE_OLLAMA\"] = \"ollama\"\n",
"os.environ[\"OLLAMA_INFERENCE_MODEL\"] = \"llama3.2:3b\"\n",
"os.environ[\"OLLAMA_EMBEDDING_MODEL\"] = \"all-minilm:l6-v2\"\n",
"os.environ[\"OLLAMA_EMBEDDING_DIMENSION\"] = \"384\"\n",
"\n",
"vector_db_id = \"my_demo_vector_db\"\n",
"client = LlamaStackClient(base_url=\"http://0.0.0.0:8321\")\n",
"\n",
"models = client.models.list()\n",
"\n",
"# Select the first LLM and first embedding models\n",
"model_id = next(m for m in models if m.model_type == \"llm\").identifier\n",
"embedding_model_id = (\n",
" em := next(m for m in models if m.model_type == \"embedding\")\n",
").identifier\n",
"embedding_dimension = em.metadata[\"embedding_dimension\"]\n",
"# Select the first ollama and first ollama's embedding model\n",
"model_id = next(m for m in models if m.model_type == \"llm\" and m.provider_id == \"ollama\").identifier\n",
"embedding_model = next(m for m in models if m.model_type == \"embedding\" and m.provider_id == \"ollama\")\n",
"embedding_model_id = embedding_model.identifier\n",
"embedding_dimension = embedding_model.metadata[\"embedding_dimension\"]\n",
"\n",
"_ = client.vector_dbs.register(\n",
" vector_db_id=vector_db_id,\n",

View file

@ -0,0 +1,6 @@
# Eval Providers
This section contains documentation for all available providers for the **eval** API.
- [inline::meta-reference](inline_meta-reference.md)
- [remote::nvidia](remote_nvidia.md)

View file

@ -0,0 +1,21 @@
# inline::meta-reference
## Description
Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
## Sample Configuration
```yaml
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/meta_reference_eval.db
```

View file

@ -0,0 +1,19 @@
# remote::nvidia
## Description
NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `evaluator_url` | `<class 'str'>` | No | http://0.0.0.0:7331 | The url for accessing the evaluator service |
## Sample Configuration
```yaml
evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
```

View file

@ -0,0 +1,33 @@
# Advanced APIs
## Post-training
Fine-tunes a model.
```{toctree}
:maxdepth: 1
post_training/index
```
## Eval
Generates outputs (via Inference or Agents) and perform scoring.
```{toctree}
:maxdepth: 1
eval/index
```
```{include} evaluation_concepts.md
:start-after: ## Evaluation Concepts
```
## Scoring
Evaluates the outputs of the system.
```{toctree}
:maxdepth: 1
scoring/index
```

View file

@ -0,0 +1,7 @@
# Post_Training Providers
This section contains documentation for all available providers for the **post_training** API.
- [inline::huggingface](inline_huggingface.md)
- [inline::torchtune](inline_torchtune.md)
- [remote::nvidia](remote_nvidia.md)

View file

@ -0,0 +1,33 @@
# inline::huggingface
## Description
HuggingFace-based post-training provider for fine-tuning models using the HuggingFace ecosystem.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `device` | `<class 'str'>` | No | cuda | |
| `distributed_backend` | `Literal['fsdp', 'deepspeed'` | No | | |
| `checkpoint_format` | `Literal['full_state', 'huggingface'` | No | huggingface | |
| `chat_template` | `<class 'str'>` | No | |
| `model_specific_config` | `<class 'dict'>` | No | {'trust_remote_code': True, 'attn_implementation': 'sdpa'} | |
| `max_seq_length` | `<class 'int'>` | No | 2048 | |
| `gradient_checkpointing` | `<class 'bool'>` | No | False | |
| `save_total_limit` | `<class 'int'>` | No | 3 | |
| `logging_steps` | `<class 'int'>` | No | 10 | |
| `warmup_ratio` | `<class 'float'>` | No | 0.1 | |
| `weight_decay` | `<class 'float'>` | No | 0.01 | |
| `dataloader_num_workers` | `<class 'int'>` | No | 4 | |
| `dataloader_pin_memory` | `<class 'bool'>` | No | True | |
## Sample Configuration
```yaml
checkpoint_format: huggingface
distributed_backend: null
device: cpu
```

View file

@ -0,0 +1,20 @@
# inline::torchtune
## Description
TorchTune-based post-training provider for fine-tuning and optimizing models using Meta's TorchTune framework.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `torch_seed` | `int \| None` | No | | |
| `checkpoint_format` | `Literal['meta', 'huggingface'` | No | meta | |
## Sample Configuration
```yaml
checkpoint_format: meta
```

View file

@ -0,0 +1,28 @@
# remote::nvidia
## Description
NVIDIA's post-training provider for fine-tuning models on NVIDIA's platform.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `api_key` | `str \| None` | No | | The NVIDIA API key. |
| `dataset_namespace` | `str \| None` | No | default | The NVIDIA dataset namespace. |
| `project_id` | `str \| None` | No | test-example-model@v1 | The NVIDIA project ID. |
| `customizer_url` | `str \| None` | No | | Base URL for the NeMo Customizer API |
| `timeout` | `<class 'int'>` | No | 300 | Timeout for the NVIDIA Post Training API |
| `max_retries` | `<class 'int'>` | No | 3 | Maximum number of retries for the NVIDIA Post Training API |
| `output_model_dir` | `<class 'str'>` | No | test-example-model@v1 | Directory to save the output model |
## Sample Configuration
```yaml
api_key: ${env.NVIDIA_API_KEY:=}
dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}
```

View file

@ -0,0 +1,7 @@
# Scoring Providers
This section contains documentation for all available providers for the **scoring** API.
- [inline::basic](inline_basic.md)
- [inline::braintrust](inline_braintrust.md)
- [inline::llm-as-judge](inline_llm-as-judge.md)

View file

@ -0,0 +1,13 @@
# inline::basic
## Description
Basic scoring provider for simple evaluation metrics and scoring functions.
## Sample Configuration
```yaml
{}
```

View file

@ -0,0 +1,19 @@
# inline::braintrust
## Description
Braintrust scoring provider for evaluation and scoring using the Braintrust platform.
## Configuration
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `openai_api_key` | `str \| None` | No | | The OpenAI API Key |
## Sample Configuration
```yaml
openai_api_key: ${env.OPENAI_API_KEY:=}
```

View file

@ -0,0 +1,13 @@
# inline::llm-as-judge
## Description
LLM-as-judge scoring provider that uses language models to evaluate and score responses.
## Sample Configuration
```yaml
{}
```

View file

@ -1,4 +1,4 @@
# Building AI Applications (Examples)
# AI Application Examples
Llama Stack provides all the building blocks needed to create sophisticated AI applications.
@ -27,4 +27,5 @@ tools
evals
telemetry
safety
playground/index
```

View file

@ -1,4 +1,4 @@
# Llama Stack Playground
## Llama Stack Playground
```{note}
The Llama Stack Playground is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
@ -9,7 +9,7 @@ The Llama Stack Playground is an simple interface which aims to:
- Demo **end-to-end** application code to help users get started to build their own applications
- Provide an **UI** to help users inspect and understand Llama Stack API providers and resources
## Key Features
### Key Features
#### Playground
Interactive pages for users to play with and explore Llama Stack API capabilities.
@ -90,7 +90,7 @@ Interactive pages for users to play with and explore Llama Stack API capabilitie
- Under the hood, it uses Llama Stack's `/<resources>/list` API to get information about each resources.
- Please visit [Core Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html) for more details about the resources.
## Starting the Llama Stack Playground
### Starting the Llama Stack Playground
To start the Llama Stack Playground, run the following commands:

View file

@ -1,31 +1,39 @@
# Why Llama Stack?
## Llama Stack architecture
Building production AI applications today requires solving multiple challenges:
**Infrastructure Complexity**
- Running large language models efficiently requires specialized infrastructure.
- Different deployment scenarios (local development, cloud, edge) need different solutions.
- Moving from development to production often requires significant rework.
**Essential Capabilities**
- Safety guardrails and content filtering are necessary in an enterprise setting.
- Just model inference is not enough - Knowledge retrieval and RAG capabilities are required.
- Nearly any application needs composable multi-step workflows.
- Finally, without monitoring, observability and evaluation, you end up operating in the dark.
**Lack of Flexibility and Choice**
- Directly integrating with multiple providers creates tight coupling.
- Different providers have different APIs and abstractions.
- Changing providers requires significant code changes.
### Our Solution: A Universal Stack
Llama Stack allows you to build different layers of distributions for your AI workloads using various SDKs and API providers.
```{image} ../../_static/llama-stack.png
:alt: Llama Stack
:width: 400px
```
### Benefits of Llama stack
#### Current challenges in custom AI applications
Building production AI applications today requires solving multiple challenges:
**Infrastructure Complexity**
- Running large language models efficiently requires specialized infrastructure.
- Different deployment scenarios (local development, cloud, edge) need different solutions.
- Moving from development to production often requires significant rework.
**Essential Capabilities**
- Safety guardrails and content filtering are necessary in an enterprise setting.
- Just model inference is not enough - Knowledge retrieval and RAG capabilities are required.
- Nearly any application needs composable multi-step workflows.
- Without monitoring, observability and evaluation, you end up operating in the dark.
**Lack of Flexibility and Choice**
- Directly integrating with multiple providers creates tight coupling.
- Different providers have different APIs and abstractions.
- Changing providers requires significant code changes.
#### Our Solution: A Universal Stack
Llama Stack addresses these challenges through a service-oriented, API-first approach:
**Develop Anywhere, Deploy Everywhere**

View file

@ -2,6 +2,10 @@
Given Llama Stack's service-oriented philosophy, a few concepts and workflows arise which may not feel completely natural in the LLM landscape, especially if you are coming with a background in other frameworks.
```{include} architecture.md
:start-after: ## Llama Stack architecture
```
```{include} apis.md
:start-after: ## APIs
```
@ -10,14 +14,10 @@ Given Llama Stack's service-oriented philosophy, a few concepts and workflows ar
:start-after: ## API Providers
```
```{include} resources.md
:start-after: ## Resources
```
```{include} distributions.md
:start-after: ## Distributions
```
```{include} evaluation_concepts.md
:start-after: ## Evaluation Concepts
```{include} resources.md
:start-after: ## Resources
```

View file

@ -52,7 +52,18 @@ extensions = [
"sphinxcontrib.redoc",
"sphinxcontrib.mermaid",
"sphinxcontrib.video",
"sphinx_reredirects"
]
redirects = {
"providers/post_training/index": "../../advanced_apis/post_training/index.html",
"providers/eval/index": "../../advanced_apis/eval/index.html",
"providers/scoring/index": "../../advanced_apis/scoring/index.html",
"playground/index": "../../building_applications/playground/index.html",
"openai/index": "../../providers/index.html#openai-api-compatibility",
"introduction/index": "../concepts/index.html#llama-stack-architecture"
}
myst_enable_extensions = ["colon_fence"]
html_theme = "sphinx_rtd_theme"

View file

@ -0,0 +1,4 @@
# Deployment Examples
```{include} kubernetes_deployment.md
```

View file

@ -1,10 +1,12 @@
# Kubernetes Deployment Guide
## Kubernetes Deployment Guide
Instead of starting the Llama Stack and vLLM servers locally. We can deploy them in a Kubernetes cluster.
### Prerequisites
In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
Note: You can also deploy the Llama Stack server in an AWS EKS cluster. See [Deploying Llama Stack Server in AWS EKS](#deploying-llama-stack-server-in-aws-eks) for more details.
First, create a local Kubernetes cluster via Kind:
```
@ -217,3 +219,29 @@ Finally, we forward the Kubernetes service to a local port and test some inferen
kubectl port-forward service/llama-stack-service 5000:5000
llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
```
## Deploying Llama Stack Server in AWS EKS
We've also provided a script to deploy the Llama Stack server in an AWS EKS cluster.
Prerequisites:
- Set up an [EKS cluster](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html).
- Create a [Github OAuth app](https://docs.github.com/en/apps/oauth-apps/building-oauth-apps/creating-an-oauth-app) and get the client ID and client secret.
- Set the `Authorization callback URL` to `http://<your-llama-stack-ui-url>/api/auth/callback/`
Run the following script to deploy the Llama Stack server:
```
export HF_TOKEN=<your-huggingface-token>
export GITHUB_CLIENT_ID=<your-github-client-id>
export GITHUB_CLIENT_SECRET=<your-github-client-secret>
export LLAMA_STACK_UI_URL=<your-llama-stack-ui-url>
cd docs/source/distributions/eks
./apply.sh
```
This script will:
- Set up a default storage class for AWS EKS
- Deploy the Llama Stack server in a Kubernetes Pod and Service

View file

@ -145,6 +145,10 @@ $ llama stack build --template starter
...
You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml`
```
```{tip}
The generated `run.yaml` file is a starting point for your configuration. For comprehensive guidance on customizing it for your specific needs, infrastructure, and deployment scenarios, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
```
:::
:::{tab-item} Building from Scratch
@ -393,17 +397,17 @@ llama stack list
```
```
------------------------------+-----------------------------------------------------------------------------+--------------+------------+
| Stack Name | Path | Build Config | Run Config |
+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
| together | /home/wenzhou/.llama/distributions/together | Yes | No |
+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
| bedrock | /home/wenzhou/.llama/distributions/bedrock | Yes | No |
+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
| starter | /home/wenzhou/.llama/distributions/starter | No | No |
+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
| remote-vllm | /home/wenzhou/.llama/distributions/remote-vllm | Yes | Yes |
+------------------------------+-----------------------------------------------------------------------------+--------------+------------+
------------------------------+-----------------------------------------------------------------+--------------+------------+
| Stack Name | Path | Build Config | Run Config |
+------------------------------+-----------------------------------------------------------------------------+--------------+
| together | ~/.llama/distributions/together | Yes | No |
+------------------------------+-----------------------------------------------------------------------------+--------------+
| bedrock | ~/.llama/distributions/bedrock | Yes | No |
+------------------------------+-----------------------------------------------------------------------------+--------------+
| starter | ~/.llama/distributions/starter | Yes | Yes |
+------------------------------+-----------------------------------------------------------------------------+--------------+
| remote-vllm | ~/.llama/distributions/remote-vllm | Yes | Yes |
+------------------------------+-----------------------------------------------------------------------------+--------------+
```
### Removing a Distribution

View file

@ -2,6 +2,10 @@
The Llama Stack runtime configuration is specified as a YAML file. Here is a simplified version of an example configuration file for the Ollama distribution:
```{note}
The default `run.yaml` files generated by templates are starting points for your configuration. For guidance on customizing these files for your specific needs, see [Customizing Your run.yaml Configuration](customizing_run_yaml.md).
```
```{dropdown} 👋 Click here for a Sample Configuration File
```yaml
@ -56,8 +60,8 @@ shields: []
server:
port: 8321
auth:
provider_type: "oauth2_token"
config:
provider_config:
type: "oauth2_token"
jwks:
uri: "https://my-token-issuing-svc.com/jwks"
```
@ -226,6 +230,8 @@ server:
### Authentication Configuration
> **Breaking Change (v0.2.14)**: The authentication configuration structure has changed. The previous format with `provider_type` and `config` fields has been replaced with a unified `provider_config` field that includes the `type` field. Update your configuration files accordingly.
The `auth` section configures authentication for the server. When configured, all API requests must include a valid Bearer token in the Authorization header:
```
@ -240,8 +246,8 @@ The server can be configured to use service account tokens for authorization, va
```yaml
server:
auth:
provider_type: "oauth2_token"
config:
provider_config:
type: "oauth2_token"
jwks:
uri: "https://kubernetes.default.svc:8443/openid/v1/jwks"
token: "${env.TOKEN:+}"
@ -325,13 +331,25 @@ You can easily validate a request by running:
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers
```
#### GitHub Token Provider
Validates GitHub personal access tokens or OAuth tokens directly:
```yaml
server:
auth:
provider_config:
type: "github_token"
github_api_base_url: "https://api.github.com" # Or GitHub Enterprise URL
```
The provider fetches user information from GitHub and maps it to access attributes based on the `claims_mapping` configuration.
#### Custom Provider
Validates tokens against a custom authentication endpoint:
```yaml
server:
auth:
provider_type: "custom"
config:
provider_config:
type: "custom"
endpoint: "https://auth.example.com/validate" # URL of the auth endpoint
```
@ -416,8 +434,8 @@ clients.
server:
port: 8321
auth:
provider_type: custom
config:
provider_config:
type: custom
endpoint: https://auth.example.com/validate
quota:
kvstore:

View file

@ -0,0 +1,40 @@
# Customizing run.yaml Files
The `run.yaml` files generated by Llama Stack templates are **starting points** designed to be customized for your specific needs. They are not meant to be used as-is in production environments.
## Key Points
- **Templates are starting points**: Generated `run.yaml` files contain defaults for development/testing
- **Customization expected**: Update URLs, credentials, models, and settings for your environment
- **Version control separately**: Keep customized configs in your own repository
- **Environment-specific**: Create different configurations for dev, staging, production
## What You Can Customize
You can customize:
- **Provider endpoints**: Change `http://localhost:8000` to your actual servers
- **Swap providers**: Replace default providers (e.g., swap Tavily with Brave for search)
- **Storage paths**: Move from `/tmp/` to production directories
- **Authentication**: Add API keys, SSL, timeouts
- **Models**: Different model sizes for dev vs prod
- **Database settings**: Switch from SQLite to PostgreSQL
- **Tool configurations**: Add custom tools and integrations
## Best Practices
- Use environment variables for secrets and environment-specific values
- Create separate `run.yaml` files for different environments (dev, staging, prod)
- Document your changes with comments
- Test configurations before deployment
- Keep your customized configs in version control
Example structure:
```
your-project/
├── configs/
│ ├── dev-run.yaml
│ ├── prod-run.yaml
└── README.md
```
The goal is to take the generated template and adapt it to your specific infrastructure and operational needs.

View file

@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
K8S_DIR="${SCRIPT_DIR}/../k8s"
echo "Setting up AWS EKS-specific storage class..."
kubectl apply -f gp3-topology-aware.yaml
echo "Running main Kubernetes deployment..."
cd "${K8S_DIR}"
./apply.sh "$@"

View file

@ -0,0 +1,15 @@
# Set up default storage class on AWS EKS
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: gp3-topology-aware
annotations:
storageclass.kubernetes.io/is-default-class: "true"
parameters:
type: gp3
iops: "3000"
throughput: "125"
provisioner: ebs.csi.aws.com
reclaimPolicy: Delete
volumeBindingMode: WaitForFirstConsumer
allowVolumeExpansion: true

View file

@ -6,13 +6,9 @@ This section provides an overview of the distributions available in Llama Stack.
```{toctree}
:maxdepth: 3
list_of_distributions
building_distro
customizing_run_yaml
importing_as_library
configuration
list_of_distributions
kubernetes_deployment
building_distro
on_device_distro
remote_hosted_distro
self_hosted_distro
```

View file

@ -6,16 +6,47 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
export POSTGRES_USER=${POSTGRES_USER:-llamastack}
export POSTGRES_DB=${POSTGRES_DB:-llamastack}
export POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-llamastack}
export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
# HF_TOKEN should be set by the user; base64 encode it for the secret
if [ -n "${HF_TOKEN:-}" ]; then
export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
else
echo "ERROR: HF_TOKEN not set. You need it for vLLM to download models from Hugging Face."
exit 1
fi
if [ -z "${GITHUB_CLIENT_ID:-}" ]; then
echo "ERROR: GITHUB_CLIENT_ID not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
exit 1
fi
if [ -z "${GITHUB_CLIENT_SECRET:-}" ]; then
echo "ERROR: GITHUB_CLIENT_SECRET not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
exit 1
fi
if [ -z "${LLAMA_STACK_UI_URL:-}" ]; then
echo "ERROR: LLAMA_STACK_UI_URL not set. Should be set to the external URL of the UI (excluding port). You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
exit 1
fi
export INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
set -euo pipefail
set -x
# Apply the HF token secret if HF_TOKEN is provided
if [ -n "${HF_TOKEN:-}" ]; then
envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
fi
envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -

View file

@ -0,0 +1,7 @@
apiVersion: v1
kind: Secret
metadata:
name: hf-token-secret
type: Opaque
data:
token: ${HF_TOKEN_BASE64}

View file

@ -22,10 +22,10 @@ data:
- provider_id: vllm-safety
provider_type: remote::vllm
config:
url: ${env.VLLM_SAFETY_URL:http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
tls_verify: ${env.VLLM_TLS_VERIFY:true}
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
@ -33,7 +33,7 @@ data:
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:+}
url: ${env.CHROMADB_URL:=}
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
@ -48,7 +48,7 @@ data:
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: postgres
@ -61,8 +61,8 @@ data:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: ${env.OTEL_SERVICE_NAME:+}
sinks: ${env.TELEMETRY_SINKS:console}
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
@ -122,6 +122,9 @@ data:
provider_id: rag-runtime
server:
port: 8321
auth:
provider_config:
type: github_token
kind: ConfigMap
metadata:
creationTimestamp: null

View file

@ -27,7 +27,7 @@ spec:
spec:
containers:
- name: llama-stack
image: llamastack/distribution-remote-vllm:latest
image: llamastack/distribution-starter:latest
imagePullPolicy: Always # since we have specified latest instead of a version
env:
- name: ENABLE_CHROMADB

View file

@ -30,7 +30,7 @@ providers:
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:+}
url: ${env.CHROMADB_URL:=}
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
@ -58,8 +58,8 @@ providers:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: ${env.OTEL_SERVICE_NAME:+console}
sinks: ${env.TELEMETRY_SINKS:+console}
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
@ -119,3 +119,6 @@ tool_groups:
provider_id: rag-runtime
server:
port: 8321
auth:
provider_config:
type: github_token

View file

@ -26,6 +26,12 @@ spec:
value: "http://llama-stack-service:8321"
- name: LLAMA_STACK_UI_PORT
value: "8322"
- name: GITHUB_CLIENT_ID
value: "${GITHUB_CLIENT_ID}"
- name: GITHUB_CLIENT_SECRET
value: "${GITHUB_CLIENT_SECRET}"
- name: NEXTAUTH_URL
value: "${LLAMA_STACK_UI_URL}:8322"
args:
- -c
- |

View file

@ -25,23 +25,17 @@ spec:
app.kubernetes.io/name: vllm
workload-type: inference
spec:
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: workload-type
operator: In
values:
- inference
topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
nodeSelector:
eks.amazonaws.com/nodegroup: gpu
containers:
- name: vllm
image: vllm/vllm-openai:latest
command: ["/bin/sh", "-c"]
args:
- "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6"
- "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6 --enable-auto-tool-choice --tool-call-parser llama4_pythonic"
env:
- name: INFERENCE_MODEL
value: "${INFERENCE_MODEL}"
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
@ -49,6 +43,11 @@ spec:
key: token
ports:
- containerPort: 8000
resources:
limits:
nvidia.com/gpu: 1
requests:
nvidia.com/gpu: 1
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface

View file

@ -6,7 +6,6 @@ spec:
accessModes:
- ReadWriteOnce
volumeMode: Filesystem
storageClassName: gp2
resources:
requests:
storage: 30Gi
@ -26,16 +25,8 @@ spec:
app.kubernetes.io/name: vllm-safety
workload-type: inference
spec:
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: workload-type
operator: In
values:
- inference
topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
nodeSelector:
eks.amazonaws.com/nodegroup: gpu
containers:
- name: vllm-safety
image: vllm/vllm-openai:latest
@ -44,6 +35,8 @@ spec:
"vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3"
]
env:
- name: SAFETY_MODEL
value: "${SAFETY_MODEL}"
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
@ -51,6 +44,11 @@ spec:
key: token
ports:
- containerPort: 8001
resources:
limits:
nvidia.com/gpu: 1
requests:
nvidia.com/gpu: 1
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface

View file

@ -39,6 +39,13 @@ docker pull llama-stack/distribution-meta-reference-gpu
**Guides:** [Meta Reference GPU Guide](self_hosted_distro/meta-reference-gpu)
### 🖥️ Self-Hosted with NVIDA NeMo Microservices
**Use `nvidia` if you:**
- Want to use Llama Stack with NVIDIA NeMo Microservices
**Guides:** [NVIDIA Distribution Guide](self_hosted_distro/nvidia)
### ☁️ Managed Hosting
**Use remote-hosted endpoints if you:**

View file

@ -13,7 +13,7 @@ Latest Release Notes: [link](https://github.com/meta-llama/llama-stack-client-ko
*Tagged releases are stable versions of the project. While we strive to maintain a stable main branch, it's not guaranteed to be free of bugs or issues.*
## Android Demo App
Check out our demo app to see how to integrate Llama Stack into your Android app: [Android Demo App](https://github.com/meta-llama/llama-stack-client-kotlin/tree/examples/android_app)
Check out our demo app to see how to integrate Llama Stack into your Android app: [Android Demo App](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app)
The key files in the app are `ExampleLlamaStackLocalInference.kt`, `ExampleLlamaStackRemoteInference.kts`, and `MainActivity.java`. With encompassed business logic, the app shows how to use Llama Stack for both the environments.
@ -68,7 +68,7 @@ Ensure the Llama Stack server version is the same as the Kotlin SDK Library for
Other inference providers: [Table](https://llama-stack.readthedocs.io/en/latest/index.html#supported-llama-stack-implementations)
How to set remote localhost in Demo App: [Settings](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/android_app#settings)
How to set remote localhost in Demo App: [Settings](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app#settings)
### Initialize the Client
A client serves as the primary interface for interacting with a specific inference type and its associated parameters. Only after client is initialized then you can configure and start inferences.
@ -135,7 +135,7 @@ val result = client!!.inference().chatCompletionStreaming(
### Setup Custom Tool Calling
Android demo app for more details: [Custom Tool Calling](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/android_app#tool-calling)
Android demo app for more details: [Custom Tool Calling](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release/examples/android_app#tool-calling)
## Advanced Users

View file

@ -0,0 +1,177 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# NVIDIA Distribution
The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `inline::localfs`, `remote::nvidia` |
| eval | `remote::nvidia` |
| inference | `remote::nvidia` |
| post_training | `remote::nvidia` |
| safety | `remote::nvidia` |
| scoring | `inline::basic` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `inline::rag-runtime` |
| vector_io | `inline::faiss` |
### Environment Variables
The following environment variables can be configured:
- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
- `NVIDIA_APPEND_API_VERSION`: Whether to append the API version to the base_url (default: `True`)
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
- `NVIDIA_GUARDRAILS_CONFIG_ID`: NVIDIA Guardrail Configuration ID (default: `self-check`)
- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`)
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)
### Models
The following models are available by default:
- `meta/llama3-8b-instruct (aliases: meta-llama/Llama-3-8B-Instruct)`
- `meta/llama3-70b-instruct (aliases: meta-llama/Llama-3-70B-Instruct)`
- `meta/llama-3.1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)`
- `meta/llama-3.1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)`
- `meta/llama-3.1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)`
- `meta/llama-3.2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)`
- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)`
- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)`
- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)`
- `meta/llama-3.3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)`
- `nvidia/llama-3.2-nv-embedqa-1b-v2 `
- `nvidia/nv-embedqa-e5-v5 `
- `nvidia/nv-embedqa-mistral-7b-v2 `
- `snowflake/arctic-embed-l `
## Prerequisites
### NVIDIA API Keys
Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
### Deploy NeMo Microservices Platform
The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
## Supported Services
Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
### Inference: NVIDIA NIM
NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
The deployed platform includes the NIM Proxy microservice, which is the service that provides to access your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
### Datasetio API: NeMo Data Store
The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposts APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
See the {repopath}`NVIDIA Datasetio docs::llama_stack/providers/remote/datasetio/nvidia/README.md` for supported features and example usage.
### Eval API: NeMo Evaluator
The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
See the {repopath}`NVIDIA Eval docs::llama_stack/providers/remote/eval/nvidia/README.md` for supported features and example usage.
### Post-Training API: NeMo Customizer
The NeMo Customizer microservice supports fine-tuning models. You can reference {repopath}`this list of supported models::llama_stack/providers/remote/post_training/nvidia/models.py` that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
See the {repopath}`NVIDIA Post-Training docs::llama_stack/providers/remote/post_training/nvidia/README.md` for supported features and example usage.
### Safety API: NeMo Guardrails
The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
See the {repopath}`NVIDIA Safety docs::llama_stack/providers/remote/safety/nvidia/README.md` for supported features and example usage.
## Deploying models
In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
Note: For improved inference speeds, we need to use NIM with `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
```sh
# URL to NeMo NIM Proxy service
export NEMO_URL="http://nemo.test"
curl --location "$NEMO_URL/v1/deployment/model-deployments" \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"name": "llama-3.2-1b-instruct",
"namespace": "meta",
"config": {
"model": "meta/llama-3.2-1b-instruct",
"nim_deployment": {
"image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
"image_tag": "1.8.3",
"pvc_size": "25Gi",
"gpu": 1,
"additional_envs": {
"NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
}
}
}
}'
```
This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
You can also remove a deployed NIM to free up GPU resources, if needed.
```sh
export NEMO_URL="http://nemo.test"
curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
```
## Running Llama Stack with NVIDIA
You can do this via Conda or venv (build code), or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=8321
docker run \
-it \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-nvidia \
--config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env NVIDIA_API_KEY=$NVIDIA_API_KEY
```
### Via Conda
```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
llama stack build --template nvidia --image-type conda
llama stack run ./run.yaml \
--port 8321 \
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
--env INFERENCE_MODEL=$INFERENCE_MODEL
```
### Via venv
If you've set up your local development environment, you can also build the image using your local virtual environment.
```bash
INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct
llama stack build --template nvidia --image-type venv
llama stack run ./run.yaml \
--port 8321 \
--env NVIDIA_API_KEY=$NVIDIA_API_KEY \
--env INFERENCE_MODEL=$INFERENCE_MODEL
```
## Example Notebooks
For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in {repopath}`docs/notebooks/nvidia`.

View file

@ -17,18 +17,18 @@ The `llamastack/distribution-starter` distribution is a comprehensive, multi-pro
The starter distribution consists of the following provider configurations:
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
| files | `inline::localfs` |
| API | Provider(s) |
|-----|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
| files | `inline::localfs` |
| inference | `remote::openai`, `remote::fireworks`, `remote::together`, `remote::ollama`, `remote::anthropic`, `remote::gemini`, `remote::groq`, `remote::sambanova`, `remote::vllm`, `remote::tgi`, `remote::cerebras`, `remote::llama-openai-compat`, `remote::nvidia`, `remote::hf::serverless`, `remote::hf::endpoint`, `inline::sentence-transformers` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `inline::sqlite-vec`, `remote::chromadb`, `remote::pgvector` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `inline::sqlite-vec`, `inline::milvus`, `remote::chromadb`, `remote::pgvector` |
## Inference Providers
@ -167,7 +167,7 @@ When using the `:` pattern (like `${env.OLLAMA_INFERENCE_MODEL:__disabled__}`),
## Running the Distribution
You can run the starter distribution via Docker or Conda.
You can run the starter distribution via Docker, Conda, or venv.
### Via Docker
@ -186,17 +186,12 @@ docker run \
--port $LLAMA_STACK_PORT
```
### Via Conda
### Via Conda or venv
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
Ensure you have configured the starter distribution using the environment variables explained above.
```bash
llama stack build --template starter --image-type conda
llama stack run distributions/starter/run.yaml \
--port 8321 \
--env OPENAI_API_KEY=your_openai_key \
--env FIREWORKS_API_KEY=your_fireworks_key \
--env TOGETHER_API_KEY=your_together_key
uv run --with llama-stack llama stack build --template starter --image-type <conda|venv> --run
```
## Example Usage

View file

@ -28,5 +28,4 @@ If you have built a container image and want to deploy it in a Kubernetes cluste
importing_as_library
configuration
kubernetes_deployment
```

View file

@ -1,4 +1,4 @@
# Detailed Tutorial
## Detailed Tutorial
In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple agent.
A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with
@ -10,7 +10,7 @@ Llama Stack is a stateful service with REST APIs to support seamless transition
In this guide, we'll walk through how to build a RAG agent locally using Llama Stack with [Ollama](https://ollama.com/)
as the inference [provider](../providers/index.md#inference) for a Llama Model.
## Step 1: Installation and Setup
### Step 1: Installation and Setup
Install Ollama by following the instructions on the [Ollama website](https://ollama.com/download), then
download Llama 3.2 3B model, and then start the Ollama service.
@ -42,10 +42,10 @@ powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | ie
Setup your virtual environment.
```bash
uv sync --python 3.10
uv sync --python 3.12
source .venv/bin/activate
```
## Step 2: Run Llama Stack
### Step 2: Run Llama Stack
Llama Stack is a server that exposes multiple APIs, you connect with it using the Llama Stack client SDK.
::::{tab-set}
@ -54,11 +54,12 @@ Llama Stack is a server that exposes multiple APIs, you connect with it using th
You can use Python to build and run the Llama Stack server, which is useful for testing and development.
Llama Stack uses a [YAML configuration file](../distributions/configuration.md) to specify the stack setup,
which defines the providers and their settings.
which defines the providers and their settings. The generated configuration serves as a starting point that you can [customize for your specific needs](../distributions/customizing_run_yaml.md).
Now let's build and run the Llama Stack config for Ollama.
We use `starter` as template. By default all providers are disabled, this requires enable ollama by passing environment variables.
```bash
INFERENCE_MODEL=llama3.2:3b llama stack build --template starter --image-type venv --run
ENABLE_OLLAMA=ollama OLLAMA_INFERENCE_MODEL="llama3.2:3b" llama stack build --template starter --image-type venv --run
```
:::
:::{tab-item} Using `conda`
@ -69,17 +70,18 @@ which defines the providers and their settings.
Now let's build and run the Llama Stack config for Ollama.
```bash
INFERENCE_MODEL=llama3.2:3b llama stack build --template starter --image-type conda --image-name llama3-3b-conda --run
ENABLE_OLLAMA=ollama INFERENCE_MODEL="llama3.2:3b" llama stack build --template starter --image-type conda --run
```
:::
:::{tab-item} Using a Container
You can use a container image to run the Llama Stack server. We provide several container images for the server
component that works with different inference providers out of the box. For this guide, we will use
`llamastack/distribution-ollama` as the container image. If you'd like to build your own image or customize the
configurations, please check out [this guide](../references/index.md).
`llamastack/distribution-starter` as the container image. If you'd like to build your own image or customize the
configurations, please check out [this guide](../distributions/building_distro.md).
First lets setup some environment variables and create a local directory to mount into the containers file system.
```bash
export INFERENCE_MODEL="llama3.2:3b"
export ENABLE_OLLAMA=ollama
export LLAMA_STACK_PORT=8321
mkdir -p ~/.llama
```
@ -90,7 +92,7 @@ docker run -it \
--pull always \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
llamastack/distribution-ollama \
llamastack/distribution-starter \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434
@ -112,7 +114,7 @@ docker run -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
--network=host \
llamastack/distribution-ollama \
llamastack/distribution-starter \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://localhost:11434
@ -130,7 +132,7 @@ Now you can use the Llama Stack client to run inference and build agents!
You can reuse the server setup or use the [Llama Stack Client](https://github.com/meta-llama/llama-stack-client-python/).
Note that the client package is already included in the `llama-stack` package.
## Step 3: Run Client CLI
### Step 3: Run Client CLI
Open a new terminal and navigate to the same directory you started the server from. Then set up a new or activate your
existing server virtual environment.
@ -146,7 +148,7 @@ source .venv/bin/activate
:::{tab-item} Install with `venv`
```bash
uv venv client --python 3.10
uv venv client --python 3.12
source client/bin/activate
pip install llama-stack-client
```
@ -154,7 +156,7 @@ pip install llama-stack-client
:::{tab-item} Install with `conda`
```bash
yes | conda create -n stack-client python=3.10
yes | conda create -n stack-client python=3.12
conda activate stack-client
pip install llama-stack-client
```
@ -177,41 +179,60 @@ List the models
llama-stack-client models list
Available Models
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ model_type ┃ identifier ┃ provider_resource_id ┃ metadata ┃ provider_id ┃
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ embedding │ all-MiniLM-L6-v2 │ all-minilm:latest │ {'embedding_dimension': 384.0} │ ollama │
├─────────────────┼─────────────────────────────────────┼─────────────────────────────────────┼───────────────────────────────────────────┼─────────────────┤
llm │ llama3.2:3b │ llama3.2:3b │ │ ollama
└─────────────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────┘
Total models: 2
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━
┃ model_type ┃ identifier ┃ provider_resource_id ┃ metadata ┃ provider_id
┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━
│ embedding │ ollama/all-minilm:l6-v2 │ all-minilm:l6-v2 │ {'embedding_dimension': 384.0} │ ollama
├─────────────────┼─────────────────────────────────────┼─────────────────────────────────────┼───────────────────────────────────────────┼───────────────────────
... │ ... │ ... │ │ ...
├─────────────────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┼───────────────────────┤
│ llm │ ollama/Llama-3.2:3b │ llama3.2:3b │ │ ollama │
└─────────────────┴─────────────────────────────────────┴─────────────────────────────────────┴───────────────────────────────────────────┴───────────────────────┘
```
You can test basic Llama inference completion using the CLI.
```bash
llama-stack-client inference chat-completion --message "tell me a joke"
llama-stack-client inference chat-completion --model-id "ollama/llama3.2:3b" --message "tell me a joke"
```
Sample output:
```python
ChatCompletionResponse(
completion_message=CompletionMessage(
content="Here's one:\n\nWhat do you call a fake noodle?\n\nAn impasta!",
role="assistant",
stop_reason="end_of_turn",
tool_calls=[],
),
logprobs=None,
metrics=[
Metric(metric="prompt_tokens", value=14.0, unit=None),
Metric(metric="completion_tokens", value=27.0, unit=None),
Metric(metric="total_tokens", value=41.0, unit=None),
OpenAIChatCompletion(
id="chatcmpl-08d7b2be-40f3-47ed-8f16-a6f29f2436af",
choices=[
OpenAIChatCompletionChoice(
finish_reason="stop",
index=0,
message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
role="assistant",
content="Why couldn't the bicycle stand up by itself?\n\nBecause it was two-tired.",
name=None,
tool_calls=None,
refusal=None,
annotations=None,
audio=None,
function_call=None,
),
logprobs=None,
)
],
created=1751725254,
model="llama3.2:3b",
object="chat.completion",
service_tier=None,
system_fingerprint="fp_ollama",
usage={
"completion_tokens": 18,
"prompt_tokens": 29,
"total_tokens": 47,
"completion_tokens_details": None,
"prompt_tokens_details": None,
},
)
```
## Step 4: Run the Demos
### Step 4: Run the Demos
Note that these demos show the [Python Client SDK](../references/python_sdk_reference/index.md).
Other SDKs are also available, please refer to the [Client SDK](../index.md#client-sdks) list for the complete options.
@ -221,7 +242,7 @@ Other SDKs are also available, please refer to the [Client SDK](../index.md#clie
:::{tab-item} Basic Inference
Now you can run inference using the Llama Stack client SDK.
### i. Create the Script
#### i. Create the Script
Create a file `inference.py` and add the following code:
```python
@ -233,40 +254,36 @@ client = LlamaStackClient(base_url="http://localhost:8321")
models = client.models.list()
# Select the first LLM
llm = next(m for m in models if m.model_type == "llm")
llm = next(m for m in models if m.model_type == "llm" and m.provider_id == "ollama")
model_id = llm.identifier
print("Model:", model_id)
response = client.inference.chat_completion(
model_id=model_id,
response = client.chat.completions.create(
model=model_id,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Write a haiku about coding"},
],
)
print(response.completion_message.content)
print(response)
```
### ii. Run the Script
#### ii. Run the Script
Let's run the script using `uv`
```bash
uv run python inference.py
```
Which will output:
```
Model: llama3.2:3b
Here is a haiku about coding:
Lines of code unfold
Logic flows through digital night
Beauty in the bits
Model: ollama/llama3.2:3b
OpenAIChatCompletion(id='chatcmpl-30cd0f28-a2ad-4b6d-934b-13707fc60ebf', choices=[OpenAIChatCompletionChoice(finish_reason='stop', index=0, message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(role='assistant', content="Lines of code unfold\nAlgorithms dance with ease\nLogic's gentle kiss", name=None, tool_calls=None, refusal=None, annotations=None, audio=None, function_call=None), logprobs=None)], created=1751732480, model='llama3.2:3b', object='chat.completion', service_tier=None, system_fingerprint='fp_ollama', usage={'completion_tokens': 16, 'prompt_tokens': 37, 'total_tokens': 53, 'completion_tokens_details': None, 'prompt_tokens_details': None})
```
:::
:::{tab-item} Build a Simple Agent
Next we can move beyond simple inference and build an agent that can perform tasks using the Llama Stack server.
### i. Create the Script
#### i. Create the Script
Create a file `agent.py` and add the following code:
```python
@ -278,7 +295,7 @@ import uuid
client = LlamaStackClient(base_url=f"http://localhost:8321")
models = client.models.list()
llm = next(m for m in models if m.model_type == "llm")
llm = next(m for m in models if m.model_type == "llm" and m.provider_id == "ollama")
model_id = llm.identifier
agent = Agent(client, model=model_id, instructions="You are a helpful assistant.")
@ -315,19 +332,20 @@ uv run python agent.py
```{dropdown} 👋 Click here to see the sample output
Non-streaming ...
agent> I'm an artificial intelligence designed to assist and communicate with users like you. I don't have a personal identity, but I'm here to provide information, answer questions, and help with tasks to the best of my abilities.
agent> I'm an artificial intelligence designed to assist and communicate with users like you. I don't have a personal identity, but I can provide information, answer questions, and help with tasks to the best of my abilities.
I can be used for a wide range of purposes, such as:
I'm a large language model, which means I've been trained on a massive dataset of text from various sources, allowing me to understand and respond to a wide range of topics and questions. My purpose is to provide helpful and accurate information, and I'm constantly learning and improving my responses based on the interactions I have with users like you.
I can help with:
* Answering questions on various subjects
* Providing definitions and explanations
* Offering suggestions and ideas
* Helping with language translation
* Assisting with writing and proofreading
* Generating text or responses to questions
* Playing simple games or chatting about topics of interest
I'm constantly learning and improving my abilities, so feel free to ask me anything, and I'll do my best to help!
* Assisting with language-related tasks, such as proofreading and editing
* Generating text and content
* And more!
Feel free to ask me anything, and I'll do my best to help!
Streaming ...
AgentTurnResponseStreamChunk(
│ event=TurnResponseEvent(
@ -421,15 +439,15 @@ uv run python agent.py
Streaming with print helper...
inference> Déjà vu!
inference> Déjà vu! You're asking me again!
As I mentioned earlier, I'm an artificial intelligence language model. I don't have a personal identity or consciousness like humans do. I exist solely to process and respond to text-based inputs, providing information and assistance on a wide range of topics.
As I mentioned earlier, I'm a computer program designed to simulate conversation and answer questions. I don't have a personal identity or consciousness like a human would. I exist solely as a digital entity, running on computer servers and responding to inputs from users like you.
I'm a computer program designed to simulate human-like conversations, using natural language processing (NLP) and machine learning algorithms to understand and generate responses. My purpose is to help users like you with their questions, provide information, and engage in conversation.
I'm a type of artificial intelligence (AI) called a large language model, which means I've been trained on a massive dataset of text from various sources. This training allows me to understand and respond to a wide range of questions and topics.
Think of me as a virtual companion, a helpful tool designed to make your interactions more efficient and enjoyable. I don't have personal opinions, emotions, or biases, but I'm here to provide accurate and informative responses to the best of my abilities.
My purpose is to provide helpful and accurate information, answer questions, and assist users like you with tasks and conversations. I don't have personal preferences, emotions, or opinions like humans do. My goal is to be informative, neutral, and respectful in my responses.
So, who am I? I'm just a computer program designed to help you!
So, that's me in a nutshell!
```
:::
@ -437,7 +455,7 @@ uv run python agent.py
For our last demo, we can build a RAG agent that can answer questions about the Torchtune project using the documents
in a vector database.
### i. Create the Script
#### i. Create the Script
Create a file `rag_agent.py` and add the following code:
```python
@ -483,7 +501,11 @@ client.tool_runtime.rag_tool.insert(
)
# Get the model being served
llm = next(m for m in client.models.list() if m.model_type == "llm")
llm = next(
m
for m in client.models.list()
if m.model_type == "llm" and m.provider_id == "ollama"
)
model = llm.identifier
# Create the RAG agent
@ -511,7 +533,7 @@ for t in turns:
for event in AgentEventLogger().log(stream):
event.print()
```
### ii. Run the Script
#### ii. Run the Script
Let's run the script using `uv`
```bash
uv run python rag_agent.py

View file

@ -1,123 +1,13 @@
# Quickstart
# Getting Started
Get started with Llama Stack in minutes!
Llama Stack is a stateful service with REST APIs to support the seamless transition of AI applications across different
environments. You can build and test using a local server first and deploy to a hosted endpoint for production.
In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)
as the inference [provider](../providers/inference/index) for a Llama Model.
**💡 Notebook Version:** You can also follow this quickstart guide in a Jupyter notebook format: [quick_start.ipynb](https://github.com/meta-llama/llama-stack/blob/main/docs/quick_start.ipynb)
#### Step 1: Install and setup
1. Install [uv](https://docs.astral.sh/uv/)
2. Run inference on a Llama model with [Ollama](https://ollama.com/download)
```bash
ollama run llama3.2:3b --keepalive 60m
```{include} quickstart.md
:start-after: ## Quickstart
```
#### Step 2: Run the Llama Stack server
We will use `uv` to run the Llama Stack server.
```bash
INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template starter --image-type venv --run
```{include} libraries.md
:start-after: ## Libraries (SDKs)
```
#### Step 3: Run the demo
Now open up a new terminal and copy the following script into a file named `demo_script.py`.
```python
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
vector_db_id = "my_demo_vector_db"
client = LlamaStackClient(base_url="http://localhost:8321")
models = client.models.list()
# Select the first LLM and first embedding models
model_id = next(m for m in models if m.model_type == "llm").identifier
embedding_model_id = (
em := next(m for m in models if m.model_type == "embedding")
).identifier
embedding_dimension = em.metadata["embedding_dimension"]
_ = client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
provider_id="faiss",
)
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
document_id="document_1",
content=source,
mime_type="text/html",
metadata={},
)
client.tool_runtime.rag_tool.insert(
documents=[document],
vector_db_id=vector_db_id,
chunk_size_in_tokens=50,
)
agent = Agent(
client,
model=model_id,
instructions="You are a helpful assistant",
tools=[
{
"name": "builtin::rag/knowledge_search",
"args": {"vector_db_ids": [vector_db_id]},
}
],
)
prompt = "How do you do great work?"
print("prompt>", prompt)
response = agent.create_turn(
messages=[{"role": "user", "content": prompt}],
session_id=agent.create_session("rag_session"),
stream=True,
)
for log in AgentEventLogger().log(response):
log.print()
```{include} detailed_tutorial.md
:start-after: ## Detailed Tutorial
```
We will use `uv` to run the script
```
uv run --with llama-stack-client,fire,requests demo_script.py
```
And you should see output like below.
```
rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
prompt> How do you do great work?
inference> [knowledge_search(query="What is the key to doing great work")]
tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
To further clarify, I would suggest that doing great work involves:
* Completing tasks with high quality and attention to detail
* Expanding on existing knowledge or ideas
* Making a positive impact on others through your work
* Striving for excellence and continuous improvement
Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
```
Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
## Next Steps
Now you're ready to dive deeper into Llama Stack!
- Explore the [Detailed Tutorial](./detailed_tutorial.md).
- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).
- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).
- Learn about Llama Stack [Concepts](../concepts/index.md).
- Discover how to [Build Llama Stacks](../distributions/index.md).
- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.
- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials.

View file

@ -0,0 +1,10 @@
## Libraries (SDKs)
We have a number of client-side SDKs available for different languages.
| **Language** | **Client SDK** | **Package** |
| :----: | :----: | :----: |
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift/tree/latest-release) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)

View file

@ -0,0 +1,129 @@
## Quickstart
Get started with Llama Stack in minutes!
Llama Stack is a stateful service with REST APIs to support the seamless transition of AI applications across different
environments. You can build and test using a local server first and deploy to a hosted endpoint for production.
In this guide, we'll walk through how to build a RAG application locally using Llama Stack with [Ollama](https://ollama.com/)
as the inference [provider](../providers/inference/index) for a Llama Model.
**💡 Notebook Version:** You can also follow this quickstart guide in a Jupyter notebook format: [quick_start.ipynb](https://github.com/meta-llama/llama-stack/blob/main/docs/quick_start.ipynb)
#### Step 1: Install and setup
1. Install [uv](https://docs.astral.sh/uv/)
2. Run inference on a Llama model with [Ollama](https://ollama.com/download)
```bash
ollama run llama3.2:3b --keepalive 60m
```
#### Step 2: Run the Llama Stack server
We will use `uv` to run the Llama Stack server.
```bash
ENABLE_OLLAMA=ollama OLLAMA_INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template starter --image-type venv --run
```
#### Step 3: Run the demo
Now open up a new terminal and copy the following script into a file named `demo_script.py`.
```python
from llama_stack_client import Agent, AgentEventLogger, RAGDocument, LlamaStackClient
vector_db_id = "my_demo_vector_db"
client = LlamaStackClient(base_url="http://localhost:8321")
models = client.models.list()
# Select the first LLM and first embedding models
model_id = next(m for m in models if m.model_type == "llm").identifier
embedding_model_id = (
em := next(m for m in models if m.model_type == "embedding")
).identifier
embedding_dimension = em.metadata["embedding_dimension"]
_ = client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model=embedding_model_id,
embedding_dimension=embedding_dimension,
provider_id="faiss",
)
source = "https://www.paulgraham.com/greatwork.html"
print("rag_tool> Ingesting document:", source)
document = RAGDocument(
document_id="document_1",
content=source,
mime_type="text/html",
metadata={},
)
client.tool_runtime.rag_tool.insert(
documents=[document],
vector_db_id=vector_db_id,
chunk_size_in_tokens=50,
)
agent = Agent(
client,
model=model_id,
instructions="You are a helpful assistant",
tools=[
{
"name": "builtin::rag/knowledge_search",
"args": {"vector_db_ids": [vector_db_id]},
}
],
)
prompt = "How do you do great work?"
print("prompt>", prompt)
response = agent.create_turn(
messages=[{"role": "user", "content": prompt}],
session_id=agent.create_session("rag_session"),
stream=True,
)
for log in AgentEventLogger().log(response):
log.print()
```
We will use `uv` to run the script
```
uv run --with llama-stack-client,fire,requests demo_script.py
```
And you should see output like below.
```
rag_tool> Ingesting document: https://www.paulgraham.com/greatwork.html
prompt> How do you do great work?
inference> [knowledge_search(query="What is the key to doing great work")]
tool_execution> Tool:knowledge_search Args:{'query': 'What is the key to doing great work'}
tool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 2:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 3:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 4:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text="Result 5:\nDocument_id:docum\nContent: work. Doing great work means doing something important\nso well that you expand people's ideas of what's possible. But\nthere's no threshold for importance. It's a matter of degree, and\noften hard to judge at the time anyway.\n", type='text'), TextContentItem(text='END of knowledge_search tool results.\n', type='text')]
inference> Based on the search results, it seems that doing great work means doing something important so well that you expand people's ideas of what's possible. However, there is no clear threshold for importance, and it can be difficult to judge at the time.
To further clarify, I would suggest that doing great work involves:
* Completing tasks with high quality and attention to detail
* Expanding on existing knowledge or ideas
* Making a positive impact on others through your work
* Striving for excellence and continuous improvement
Ultimately, great work is about making a meaningful contribution and leaving a lasting impression.
```
Congratulations! You've successfully built your first RAG application using Llama Stack! 🎉🥳
```{admonition} HuggingFace access
:class: tip
If you are getting a **401 Client Error** from HuggingFace for the **all-MiniLM-L6-v2** model, try setting **HF_TOKEN** to a valid HuggingFace token in your environment
```
### Next Steps
Now you're ready to dive deeper into Llama Stack!
- Explore the [Detailed Tutorial](./detailed_tutorial.md).
- Try the [Getting Started Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb).
- Browse more [Notebooks on GitHub](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks).
- Learn about Llama Stack [Concepts](../concepts/index.md).
- Discover how to [Build Llama Stacks](../distributions/index.md).
- Refer to our [References](../references/index.md) for details on the Llama CLI and Python SDK.
- Check out the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repository for example applications and tutorials.

View file

@ -40,17 +40,6 @@ Kotlin.
- Ready to build? Check out the [Quick Start](getting_started/index) to get started.
- Want to contribute? See the [Contributing](contributing/index) guide.
## Client SDKs
We have a number of client-side SDKs available for different languages.
| **Language** | **Client SDK** | **Package** |
| :----: | :----: | :----: |
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift/tree/latest-release) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin/tree/latest-release) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
## Supported Llama Stack Implementations
A number of "adapters" are available for some popular Inference and Vector Store providers. For other APIs (particularly Safety and Agents), we provide *reference implementations* you can use to get started. We expect this list to grow over time. We are slowly onboarding more providers to the ecosystem as we get more confidence in the APIs.
@ -133,14 +122,12 @@ A number of "adapters" are available for some popular Inference and Vector Store
self
getting_started/index
getting_started/detailed_tutorial
introduction/index
concepts/index
openai/index
providers/index
distributions/index
advanced_apis/index
building_applications/index
playground/index
deploying/index
contributing/index
references/index
```

View file

@ -1,4 +1,4 @@
# Providers Overview
# API Providers Overview
The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
- LLM inference providers (e.g., Meta Reference, Ollama, Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, vLLM, OpenAI, Anthropic, Gemini, WatsonX, etc.),
@ -13,13 +13,25 @@ Providers come in two flavors:
Importantly, Llama Stack always strives to provide at least one fully inline provider for each API so you can iterate on a fully featured environment locally.
## External Providers
Llama Stack supports external providers that live outside of the main codebase. This allows you to create and maintain your own providers independently.
```{toctree}
:maxdepth: 1
external
external.md
```
```{include} openai.md
:start-after: ## OpenAI API Compatibility
```
## Inference
Runs inference with an LLM.
```{toctree}
:maxdepth: 1
inference/index
```
## Agents
@ -40,33 +52,6 @@ Interfaces with datasets and data loaders.
datasetio/index
```
## Eval
Generates outputs (via Inference or Agents) and perform scoring.
```{toctree}
:maxdepth: 1
eval/index
```
## Inference
Runs inference with an LLM.
```{toctree}
:maxdepth: 1
inference/index
```
## Post Training
Fine-tunes a model.
```{toctree}
:maxdepth: 1
post_training/index
```
## Safety
Applies safety policies to the output at a Systems (not only model) level.
@ -76,15 +61,6 @@ Applies safety policies to the output at a Systems (not only model) level.
safety/index
```
## Scoring
Evaluates the outputs of the system.
```{toctree}
:maxdepth: 1
scoring/index
```
## Telemetry
Collects telemetry data from the system.
@ -94,15 +70,6 @@ Collects telemetry data from the system.
telemetry/index
```
## Tool Runtime
Is associated with the ToolGroup resouces.
```{toctree}
:maxdepth: 1
tool_runtime/index
```
## Vector IO
Vector IO refers to operations on vector databases, such as adding documents, searching, and deleting documents.
@ -114,3 +81,12 @@ io and database are used to store and retrieve documents for retrieval.
vector_io/index
```
## Tool Runtime
Is associated with the ToolGroup resources.
```{toctree}
:maxdepth: 1
tool_runtime/index
```

View file

@ -1,14 +1,14 @@
# OpenAI API Compatibility
## OpenAI API Compatibility
## Server path
### Server path
Llama Stack exposes an OpenAI-compatible API endpoint at `/v1/openai/v1`. So, for a Llama Stack server running locally on port `8321`, the full url to the OpenAI-compatible API endpoint is `http://localhost:8321/v1/openai/v1`.
## Clients
### Clients
You should be able to use any client that speaks OpenAI APIs with Llama Stack. We regularly test with the official Llama Stack clients as well as OpenAI's official Python client.
### Llama Stack Client
#### Llama Stack Client
When using the Llama Stack client, set the `base_url` to the root of your Llama Stack server. It will automatically route OpenAI-compatible requests to the right server endpoint for you.
@ -18,7 +18,7 @@ from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url="http://localhost:8321")
```
### OpenAI Client
#### OpenAI Client
When using an OpenAI client, set the `base_url` to the `/v1/openai/v1` path on your Llama Stack server.
@ -30,9 +30,9 @@ client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")
Regardless of the client you choose, the following code examples should all work the same.
## APIs implemented
### APIs implemented
### Models
#### Models
Many of the APIs require you to pass in a model parameter. To see the list of models available in your Llama Stack server:
@ -40,13 +40,13 @@ Many of the APIs require you to pass in a model parameter. To see the list of mo
models = client.models.list()
```
### Responses
#### Responses
:::{note}
The Responses API implementation is still in active development. While it is quite usable, there are still unimplemented parts of the API. We'd love feedback on any use-cases you try that do not work to help prioritize the pieces left to implement. Please open issues in the [meta-llama/llama-stack](https://github.com/meta-llama/llama-stack) GitHub repository with details of anything that does not work.
:::
#### Simple inference
##### Simple inference
Request:
@ -66,7 +66,7 @@ Syntax whispers secrets sweet
Code's gentle silence
```
#### Structured Output
##### Structured Output
Request:
@ -106,9 +106,9 @@ Example output:
{ "participants": ["Alice", "Bob"] }
```
### Chat Completions
#### Chat Completions
#### Simple inference
##### Simple inference
Request:
@ -129,7 +129,7 @@ Logic flows like a river
Code's gentle beauty
```
#### Structured Output
##### Structured Output
Request:
@ -170,9 +170,9 @@ Example output:
{ "participants": ["Alice", "Bob"] }
```
### Completions
#### Completions
#### Simple inference
##### Simple inference
Request:

View file

@ -11,7 +11,8 @@ Please refer to the remote provider documentation.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | PydanticUndefined | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
## Sample Configuration

View file

@ -205,12 +205,16 @@ See [sqlite-vec's GitHub repo](https://github.com/asg017/sqlite-vec/tree/main) f
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | PydanticUndefined | |
| `db_path` | `<class 'str'>` | No | PydanticUndefined | Path to the SQLite database file |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
## Sample Configuration
```yaml
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db
```

View file

@ -10,12 +10,16 @@ Please refer to the sqlite-vec provider documentation.
| Field | Type | Required | Default | Description |
|-------|------|----------|---------|-------------|
| `db_path` | `<class 'str'>` | No | PydanticUndefined | |
| `db_path` | `<class 'str'>` | No | PydanticUndefined | Path to the SQLite database file |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend (SQLite only for now) |
## Sample Configuration
```yaml
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec.db
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/sqlite_vec_registry.db
```

View file

@ -114,6 +114,7 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
| `uri` | `<class 'str'>` | No | PydanticUndefined | The URI of the Milvus server |
| `token` | `str \| None` | No | PydanticUndefined | The token of the Milvus server |
| `consistency_level` | `<class 'str'>` | No | Strong | The consistency level of the Milvus server |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig` | No | sqlite | Config for KV store backend |
| `config` | `dict` | No | {} | This configuration allows additional fields to be passed through to the underlying Milvus client. See the [Milvus](https://milvus.io/docs/install-overview.md) documentation for more details about Milvus in general. |
> **Note**: This configuration class accepts additional fields beyond those listed above. You can pass any additional configuration options that will be forwarded to the underlying provider.
@ -123,6 +124,9 @@ For more details on TLS configuration, refer to the [TLS setup guide](https://mi
```yaml
uri: ${env.MILVUS_ENDPOINT}
token: ${env.MILVUS_TOKEN}
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/milvus_remote_registry.db
```

View file

@ -40,6 +40,7 @@ See [PGVector's documentation](https://github.com/pgvector/pgvector) for more de
| `db` | `str \| None` | No | postgres | |
| `user` | `str \| None` | No | postgres | |
| `password` | `str \| None` | No | mysecretpassword | |
| `kvstore` | `utils.kvstore.config.RedisKVStoreConfig \| utils.kvstore.config.SqliteKVStoreConfig \| utils.kvstore.config.PostgresKVStoreConfig \| utils.kvstore.config.MongoDBKVStoreConfig, annotation=NoneType, required=False, default='sqlite', discriminator='type'` | No | | Config for KV store backend (SQLite only for now) |
## Sample Configuration
@ -49,6 +50,9 @@ port: ${env.PGVECTOR_PORT:=5432}
db: ${env.PGVECTOR_DB}
user: ${env.PGVECTOR_USER}
password: ${env.PGVECTOR_PASSWORD}
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/pgvector_registry.db
```

View file

@ -36,7 +36,9 @@ See [Weaviate's documentation](https://weaviate.io/developers/weaviate) for more
## Sample Configuration
```yaml
{}
kvstore:
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:=~/.llama/dummy}/weaviate_registry.db
```

View file

@ -9,7 +9,8 @@ The `llama-stack-client` CLI allows you to query information about the distribut
llama-stack-client
Usage: llama-stack-client [OPTIONS] COMMAND [ARGS]...
Welcome to the LlamaStackClient CLI
Welcome to the llama-stack-client CLI - a command-line interface for
interacting with Llama Stack
Options:
--version Show the version and exit.
@ -35,6 +36,7 @@ Commands:
```
### `llama-stack-client configure`
Configure Llama Stack Client CLI.
```bash
llama-stack-client configure
> Enter the host name of the Llama Stack distribution server: localhost
@ -42,7 +44,24 @@ llama-stack-client configure
Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
```
Optional arguments:
- `--endpoint`: Llama Stack distribution endpoint
- `--api-key`: Llama Stack distribution API key
## `llama-stack-client inspect version`
Inspect server configuration.
```bash
llama-stack-client inspect version
```
```bash
VersionInfo(version='0.2.14')
```
### `llama-stack-client providers list`
Show available providers on distribution endpoint
```bash
llama-stack-client providers list
```
@ -66,9 +85,74 @@ llama-stack-client providers list
+-----------+----------------+-----------------+
```
### `llama-stack-client providers inspect`
Show specific provider configuration on distribution endpoint
```bash
llama-stack-client providers inspect <provider_id>
```
## Inference
Inference (chat).
### `llama-stack-client inference chat-completion`
Show available inference chat completion endpoints on distribution endpoint
```bash
llama-stack-client inference chat-completion --message <message> [--stream] [--session] [--model-id]
```
```bash
OpenAIChatCompletion(
id='chatcmpl-aacd11f3-8899-4ec5-ac5b-e655132f6891',
choices=[
OpenAIChatCompletionChoice(
finish_reason='stop',
index=0,
message=OpenAIChatCompletionChoiceMessageOpenAIAssistantMessageParam(
role='assistant',
content='The captain of the whaleship Pequod in Nathaniel Hawthorne\'s novel "Moby-Dick" is Captain
Ahab. He\'s a vengeful and obsessive old sailor who\'s determined to hunt down and kill the white sperm whale
Moby-Dick, whom he\'s lost his leg to in a previous encounter.',
name=None,
tool_calls=None,
refusal=None,
annotations=None,
audio=None,
function_call=None
),
logprobs=None
)
],
created=1752578797,
model='llama3.2:3b-instruct-fp16',
object='chat.completion',
service_tier=None,
system_fingerprint='fp_ollama',
usage={
'completion_tokens': 67,
'prompt_tokens': 33,
'total_tokens': 100,
'completion_tokens_details': None,
'prompt_tokens_details': None
}
)
```
Required arguments:
**Note:** At least one of these parameters is required for chat completion
- `--message`: Message
- `--session`: Start a Chat Session
Optional arguments:
- `--stream`: Stream
- `--model-id`: Model ID
## Model Management
Manage GenAI models.
### `llama-stack-client models list`
Show available llama models at distribution endpoint
```bash
llama-stack-client models list
```
@ -85,6 +169,7 @@ Total models: 1
```
### `llama-stack-client models get`
Show details of a specific model at the distribution endpoint
```bash
llama-stack-client models get Llama3.1-8B-Instruct
```
@ -105,69 +190,92 @@ Model RandomModel is not found at distribution endpoint host:port. Please ensure
```
### `llama-stack-client models register`
Register a new model at distribution endpoint
```bash
llama-stack-client models register <model_id> [--provider-id <provider_id>] [--provider-model-id <provider_model_id>] [--metadata <metadata>]
llama-stack-client models register <model_id> [--provider-id <provider_id>] [--provider-model-id <provider_model_id>] [--metadata <metadata>] [--model-type <model_type>]
```
### `llama-stack-client models update`
Required arguments:
- `MODEL_ID`: Model ID
- `--provider-id`: Provider ID for the model
Optional arguments:
- `--provider-model-id`: Provider's model ID
- `--metadata`: JSON metadata for the model
- `--model-type`: Model type: `llm`, `embedding`
### `llama-stack-client models unregister`
Unregister a model from distribution endpoint
```bash
llama-stack-client models update <model_id> [--provider-id <provider_id>] [--provider-model-id <provider_model_id>] [--metadata <metadata>]
```
### `llama-stack-client models delete`
```bash
llama-stack-client models delete <model_id>
llama-stack-client models unregister <model_id>
```
## Vector DB Management
Manage vector databases.
### `llama-stack-client vector_dbs list`
Show available vector dbs on distribution endpoint
```bash
llama-stack-client vector_dbs list
```
```
+--------------+----------------+---------------------+---------------+------------------------+
| identifier | provider_id | provider_resource_id| vector_db_type| params |
+==============+================+=====================+===============+========================+
| test_bank | meta-reference | test_bank | vector | embedding_model: all-MiniLM-L6-v2
embedding_dimension: 384|
+--------------+----------------+---------------------+---------------+------------------------+
┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ provider_resource_id ┃ vector_db_type ┃ params ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ my_demo_vector_db │ faiss │ my_demo_vector_db │ │ embedding_dimension: 384 │
│ │ │ │ │ embedding_model: all-MiniLM-L6-v2 │
│ │ │ │ │ type: vector_db │
│ │ │ │ │ │
└──────────────────────────┴─────────────┴──────────────────────────┴────────────────┴───────────────────────────────────┘
```
### `llama-stack-client vector_dbs register`
Create a new vector db
```bash
llama-stack-client vector_dbs register <vector-db-id> [--provider-id <provider-id>] [--provider-vector-db-id <provider-vector-db-id>] [--embedding-model <embedding-model>] [--embedding-dimension <embedding-dimension>]
```
Required arguments:
- `VECTOR_DB_ID`: Vector DB ID
Optional arguments:
- `--provider-id`: Provider ID for the vector db
- `--provider-vector-db-id`: Provider's vector db ID
- `--embedding-model`: Embedding model to use. Default: "all-MiniLM-L6-v2"
- `--embedding-model`: Embedding model to use. Default: `all-MiniLM-L6-v2`
- `--embedding-dimension`: Dimension of embeddings. Default: 384
### `llama-stack-client vector_dbs unregister`
Delete a vector db
```bash
llama-stack-client vector_dbs unregister <vector-db-id>
```
Required arguments:
- `VECTOR_DB_ID`: Vector DB ID
## Shield Management
Manage safety shield services.
### `llama-stack-client shields list`
Show available safety shields on distribution endpoint
```bash
llama-stack-client shields list
```
```
+--------------+----------+----------------+-------------+
| identifier | params | provider_id | type |
+==============+==========+================+=============+
| llama_guard | {} | meta-reference | llama_guard |
+--------------+----------+----------------+-------------+
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ identifier ┃ provider_alias ┃ params ┃ provider_id ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ ollama │ ollama/llama-guard3:1b │ │ llama-guard │
└──────────────────────────────────┴───────────────────────────────────────────────────────────────────────┴───────────────────────┴────────────────────────────────────┘
```
### `llama-stack-client shields register`
Register a new safety shield
```bash
llama-stack-client shields register --shield-id <shield-id> [--provider-id <provider-id>] [--provider-shield-id <provider-shield-id>] [--params <params>]
```
@ -180,41 +288,29 @@ Optional arguments:
- `--provider-shield-id`: Provider's shield ID
- `--params`: JSON configuration parameters for the shield
## Eval Task Management
### `llama-stack-client benchmarks list`
```bash
llama-stack-client benchmarks list
```
### `llama-stack-client benchmarks register`
```bash
llama-stack-client benchmarks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <function1> [<function2> ...] [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
```
Required arguments:
- `--eval-task-id`: ID of the eval task
- `--dataset-id`: ID of the dataset to evaluate
- `--scoring-functions`: One or more scoring functions to use for evaluation
Optional arguments:
- `--provider-id`: Provider ID for the eval task
- `--provider-eval-task-id`: Provider's eval task ID
- `--metadata`: Metadata for the eval task in JSON format
## Eval execution
Run evaluation tasks.
### `llama-stack-client eval run-benchmark`
Run a evaluation benchmark task
```bash
llama-stack-client eval run-benchmark <eval-task-id1> [<eval-task-id2> ...] --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
llama-stack-client eval run-benchmark <eval-task-id1> [<eval-task-id2> ...] --eval-task-config <config-file> --output-dir <output-dir> --model-id <model-id> [--num-examples <num>] [--visualize] [--repeat-penalty <repeat-penalty>] [--top-p <top-p>] [--max-tokens <max-tokens>]
```
Required arguments:
- `--eval-task-config`: Path to the eval task config file in JSON format
- `--output-dir`: Path to the directory where evaluation results will be saved
- `--model-id`: model id to run the benchmark eval on
Optional arguments:
- `--num-examples`: Number of examples to evaluate (useful for debugging)
- `--visualize`: If set, visualizes evaluation results after completion
- `--repeat-penalty`: repeat-penalty in the sampling params to run generation
- `--top-p`: top-p in the sampling params to run generation
- `--max-tokens`: max-tokens in the sampling params to run generation
- `--temperature`: temperature in the sampling params to run generation
Example benchmark_config.json:
```json
@ -231,21 +327,55 @@ Example benchmark_config.json:
```
### `llama-stack-client eval run-scoring`
Run scoring from application datasets
```bash
llama-stack-client eval run-scoring <eval-task-id> --eval-task-config <config-file> --output-dir <output-dir> [--num-examples <num>] [--visualize]
llama-stack-client eval run-scoring <eval-task-id> --output-dir <output-dir> [--num-examples <num>] [--visualize]
```
Required arguments:
- `--eval-task-config`: Path to the eval task config file in JSON format
- `--output-dir`: Path to the directory where scoring results will be saved
Optional arguments:
- `--num-examples`: Number of examples to evaluate (useful for debugging)
- `--visualize`: If set, visualizes scoring results after completion
- `--scoring-params-config`: Path to the scoring params config file in JSON format
- `--dataset-id`: Pre-registered dataset_id to score (from llama-stack-client datasets list)
- `--dataset-path`: Path to the dataset file to score
## Eval Tasks
Manage evaluation tasks.
### `llama-stack-client eval_tasks list`
Show available eval tasks on distribution endpoint
```bash
llama-stack-client eval_tasks list
```
### `llama-stack-client eval_tasks register`
Register a new eval task
```bash
llama-stack-client eval_tasks register --eval-task-id <eval-task-id> --dataset-id <dataset-id> --scoring-functions <scoring-functions> [--provider-id <provider-id>] [--provider-eval-task-id <provider-eval-task-id>] [--metadata <metadata>]
```
Required arguments:
- `--eval-task-id`: ID of the eval task
- `--dataset-id`: ID of the dataset to evaluate
- `--scoring-functions`: Scoring functions to use for evaluation
Optional arguments:
- `--provider-id`: Provider ID for the eval task
- `--provider-eval-task-id`: Provider's eval task ID
## Tool Group Management
Manage available tool groups.
### `llama-stack-client toolgroups list`
Show available llama toolgroups at distribution endpoint
```bash
llama-stack-client toolgroups list
```
@ -260,17 +390,28 @@ llama-stack-client toolgroups list
```
### `llama-stack-client toolgroups get`
Get available llama toolgroups by id
```bash
llama-stack-client toolgroups get <toolgroup_id>
```
Shows detailed information about a specific toolgroup. If the toolgroup is not found, displays an error message.
Required arguments:
- `TOOLGROUP_ID`: ID of the tool group
### `llama-stack-client toolgroups register`
Register a new toolgroup at distribution endpoint
```bash
llama-stack-client toolgroups register <toolgroup_id> [--provider-id <provider-id>] [--provider-toolgroup-id <provider-toolgroup-id>] [--mcp-config <mcp-config>] [--args <args>]
```
Required arguments:
- `TOOLGROUP_ID`: ID of the tool group
Optional arguments:
- `--provider-id`: Provider ID for the toolgroup
- `--provider-toolgroup-id`: Provider's toolgroup ID
@ -278,6 +419,172 @@ Optional arguments:
- `--args`: JSON arguments for the toolgroup
### `llama-stack-client toolgroups unregister`
Unregister a toolgroup from distribution endpoint
```bash
llama-stack-client toolgroups unregister <toolgroup_id>
```
Required arguments:
- `TOOLGROUP_ID`: ID of the tool group
## Datasets Management
Manage datasets.
### `llama-stack-client datasets list`
Show available datasets on distribution endpoint
```bash
llama-stack-client datasets list
```
### `llama-stack-client datasets register`
```bash
llama-stack-client datasets register --dataset_id <dataset_id> --purpose <purpose> [--url <url] [--dataset-path <dataset-path>] [--dataset-id <dataset-id>] [--metadata <metadata>]
```
Required arguments:
- `--dataset_id`: Id of the dataset
- `--purpose`: Purpose of the dataset
Optional arguments:
- `--metadata`: Metadata of the dataset
- `--url`: URL of the dataset
- `--dataset-path`: Local file path to the dataset. If specified, upload dataset via URL
### `llama-stack-client datasets unregister`
Remove a dataset
```bash
llama-stack-client datasets unregister <dataset-id>
```
Required arguments:
- `DATASET_ID`: Id of the dataset
## Scoring Functions Management
Manage scoring functions.
### `llama-stack-client scoring_functions list`
Show available scoring functions on distribution endpoint
```bash
llama-stack-client scoring_functions list
```
```
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ description ┃ type ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
│ basic::bfcl │ basic │ BFCL complex scoring │ scoring_function │
│ basic::docvqa │ basic │ DocVQA Visual Question & Answer scoring function │ scoring_function │
│ basic::equality │ basic │ Returns 1.0 if the input is equal to the target, 0.0 │ scoring_function │
│ │ │ otherwise. │ │
└────────────────────────────────────────────┴──────────────┴───────────────────────────────────────────────────────────────┴──────────────────┘
```
### `llama-stack-client scoring_functions register`
Register a new scoring function
```bash
llama-stack-client scoring_functions register --scoring-fn-id <scoring-fn-id> --description <description> --return-type <return-type> [--provider-id <provider-id>] [--provider-scoring-fn-id <provider-scoring-fn-id>] [--params <params>]
```
Required arguments:
- `--scoring-fn-id`: Id of the scoring function
- `--description`: Description of the scoring function
- `--return-type`: Return type of the scoring function
Optional arguments:
- `--provider-id`: Provider ID for the scoring function
- `--provider-scoring-fn-id`: Provider's scoring function ID
- `--params`: Parameters for the scoring function in JSON format
## Post Training Management
Post-training.
### `llama-stack-client post_training list`
Show the list of available post training jobs
```bash
llama-stack-client post_training list
```
```bash
["job-1", "job-2", "job-3"]
```
### `llama-stack-client post_training artifacts`
Get the training artifacts of a specific post training job
```bash
llama-stack-client post_training artifacts --job-uuid <job-uuid>
```
```bash
JobArtifactsResponse(checkpoints=[], job_uuid='job-1')
```
Required arguments:
- `--job-uuid`: Job UUID
### `llama-stack-client post_training supervised_fine_tune`
Kick off a supervised fine tune job
```bash
llama-stack-client post_training supervised_fine_tune --job-uuid <job-uuid> --model <model> --algorithm-config <algorithm-config> --training-config <training-config> [--checkpoint-dir <checkpoint-dir>]
```
Required arguments:
- `--job-uuid`: Job UUID
- `--model`: Model ID
- `--algorithm-config`: Algorithm Config
- `--training-config`: Training Config
Optional arguments:
- `--checkpoint-dir`: Checkpoint Config
### `llama-stack-client post_training status`
Show the status of a specific post training job
```bash
llama-stack-client post_training status --job-uuid <job-uuid>
```
```bash
JobStatusResponse(
checkpoints=[],
job_uuid='job-1',
status='completed',
completed_at="",
resources_allocated="",
scheduled_at="",
started_at=""
)
```
Required arguments:
- `--job-uuid`: Job UUID
### `llama-stack-client post_training cancel`
Cancel the training job
```bash
llama-stack-client post_training cancel --job-uuid <job-uuid>
```
```bash
# This functionality is not yet implemented for llama-stack-client
╭────────────────────────────────────────────────────────────╮
│ Failed to post_training cancel_training_job │
│ │
│ Error Type: InternalServerError │
│ Details: Error code: 501 - {'detail': 'Not implemented: '} │
╰────────────────────────────────────────────────────────────╯
```
Required arguments:
- `--job-uuid`: Job UUID

View file

@ -19,8 +19,10 @@ class PostTrainingMetric(BaseModel):
perplexity: float
@json_schema_type(schema={"description": "Checkpoint created during training runs"})
@json_schema_type
class Checkpoint(BaseModel):
"""Checkpoint created during training runs"""
identifier: str
created_at: datetime
epoch: int

View file

@ -7,7 +7,7 @@
from enum import StrEnum
from typing import Any, Literal, Protocol, runtime_checkable
from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, ConfigDict, Field, field_validator
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -36,13 +36,21 @@ class Model(CommonModelFields, Resource):
return self.identifier
@property
def provider_model_id(self) -> str | None:
def provider_model_id(self) -> str:
assert self.provider_resource_id is not None, "Provider resource ID must be set"
return self.provider_resource_id
model_config = ConfigDict(protected_namespaces=())
model_type: ModelType = Field(default=ModelType.llm)
@field_validator("provider_resource_id")
@classmethod
def validate_provider_resource_id(cls, v):
if v is None:
raise ValueError("provider_resource_id cannot be None")
return v
class ModelInput(CommonModelFields):
model_id: str

View file

@ -87,6 +87,20 @@ class RAGQueryGenerator(Enum):
custom = "custom"
@json_schema_type
class RAGSearchMode(Enum):
"""
Search modes for RAG query retrieval:
- VECTOR: Uses vector similarity search for semantic matching
- KEYWORD: Uses keyword-based search for exact matching
- HYBRID: Combines both vector and keyword search for better results
"""
VECTOR = "vector"
KEYWORD = "keyword"
HYBRID = "hybrid"
@json_schema_type
class DefaultRAGQueryGeneratorConfig(BaseModel):
type: Literal["default"] = "default"
@ -128,7 +142,7 @@ class RAGQueryConfig(BaseModel):
max_tokens_in_context: int = 4096
max_chunks: int = 5
chunk_template: str = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n"
mode: str | None = None
mode: RAGSearchMode | None = RAGSearchMode.VECTOR
ranker: Ranker | None = Field(default=None) # Only used for hybrid mode
@field_validator("chunk_template")

View file

@ -19,6 +19,7 @@ class VectorDB(Resource):
embedding_model: str
embedding_dimension: int
vector_db_name: str | None = None
@property
def vector_db_id(self) -> str:
@ -70,6 +71,7 @@ class VectorDBs(Protocol):
embedding_model: str,
embedding_dimension: int | None = 384,
provider_id: str | None = None,
vector_db_name: str | None = None,
provider_vector_db_id: str | None = None,
) -> VectorDB:
"""Register a vector database.
@ -78,6 +80,7 @@ class VectorDBs(Protocol):
:param embedding_model: The embedding model to use.
:param embedding_dimension: The dimension of the embedding model.
:param provider_id: The identifier of the provider.
:param vector_db_name: The name of the vector database.
:param provider_vector_db_id: The identifier of the vector database in the provider.
:returns: A VectorDB.
"""

View file

@ -346,7 +346,6 @@ class VectorIO(Protocol):
embedding_model: str | None = None,
embedding_dimension: int | None = 384,
provider_id: str | None = None,
provider_vector_db_id: str | None = None,
) -> VectorStoreObject:
"""Creates a vector store.
@ -358,7 +357,6 @@ class VectorIO(Protocol):
:param embedding_model: The embedding model to use for this vector store.
:param embedding_dimension: The dimension of the embedding vectors (default: 384).
:param provider_id: The ID of the provider to use for this vector store.
:param provider_vector_db_id: The provider-specific vector database ID.
:returns: A VectorStoreObject representing the created vector store.
"""
...

View file

@ -93,7 +93,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
)
sys.exit(1)
elif args.providers:
providers = dict()
providers_list: dict[str, str | list[str]] = dict()
for api_provider in args.providers.split(","):
if "=" not in api_provider:
cprint(
@ -112,7 +112,15 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
)
sys.exit(1)
if provider in providers_for_api:
providers.setdefault(api, []).append(provider)
if api not in providers_list:
providers_list[api] = []
# Use type guarding to ensure we have a list
provider_value = providers_list[api]
if isinstance(provider_value, list):
provider_value.append(provider)
else:
# Convert string to list and append
providers_list[api] = [provider_value, provider]
else:
cprint(
f"{provider} is not a valid provider for the {api} API.",
@ -121,7 +129,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
)
sys.exit(1)
distribution_spec = DistributionSpec(
providers=providers,
providers=providers_list,
description=",".join(args.providers),
)
if not args.image_type:
@ -182,7 +190,7 @@ def run_stack_build_command(args: argparse.Namespace) -> None:
cprint("Tip: use <TAB> to see options for the providers.\n", color="green", file=sys.stderr)
providers = dict()
providers: dict[str, str | list[str]] = dict()
for api, providers_for_api in get_provider_registry().items():
available_providers = [x for x in providers_for_api.keys() if x not in ("remote", "remote::sample")]
if not available_providers:
@ -371,10 +379,16 @@ def _run_stack_build_command_from_build_config(
if not image_name:
raise ValueError("Please specify an image name when building a venv image")
# At this point, image_name should be guaranteed to be a string
if image_name is None:
raise ValueError("image_name should not be None after validation")
if template_name:
build_dir = DISTRIBS_BASE_DIR / template_name
build_file_path = build_dir / f"{template_name}-build.yaml"
else:
if image_name is None:
raise ValueError("image_name cannot be None")
build_dir = DISTRIBS_BASE_DIR / image_name
build_file_path = build_dir / f"{image_name}-build.yaml"
@ -395,7 +409,7 @@ def _run_stack_build_command_from_build_config(
build_file_path,
image_name,
template_or_config=template_name or config_path or str(build_file_path),
run_config=run_config_file,
run_config=run_config_file.as_posix() if run_config_file else None,
)
if return_code != 0:
raise RuntimeError(f"Failed to build image {image_name}")
@ -403,15 +417,16 @@ def _run_stack_build_command_from_build_config(
if template_name:
# copy run.yaml from template to build_dir instead of generating it again
template_path = importlib.resources.files("llama_stack") / f"templates/{template_name}/run.yaml"
run_config_file = build_dir / f"{template_name}-run.yaml"
with importlib.resources.as_file(template_path) as path:
run_config_file = build_dir / f"{template_name}-run.yaml"
shutil.copy(path, run_config_file)
cprint("Build Successful!", color="green", file=sys.stderr)
cprint(f"You can find the newly-built template here: {template_path}", color="blue", file=sys.stderr)
cprint(f"You can find the newly-built template here: {run_config_file}", color="blue", file=sys.stderr)
cprint(
"You can run the new Llama Stack distro via: "
+ colored(f"llama stack run {template_path} --image-type {build_config.image_type}", "blue"),
+ colored(f"llama stack run {run_config_file} --image-type {build_config.image_type}", "blue"),
color="green",
file=sys.stderr,
)

View file

@ -47,8 +47,7 @@ class StackRun(Subcommand):
self.parser.add_argument(
"--image-name",
type=str,
default=os.environ.get("CONDA_DEFAULT_ENV"),
help="Name of the image to run. Defaults to the current environment",
help="Name of the image to run.",
)
self.parser.add_argument(
"--env",
@ -83,46 +82,57 @@ class StackRun(Subcommand):
return ImageType.CONDA.value, args.image_name
return args.image_type, args.image_name
def _resolve_config_and_template(self, args: argparse.Namespace) -> tuple[Path | None, str | None]:
"""Resolve config file path and template name from args.config"""
from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
if not args.config:
return None, None
config_file = Path(args.config)
has_yaml_suffix = args.config.endswith(".yaml")
template_name = None
if not config_file.exists() and not has_yaml_suffix:
# check if this is a template
config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
if config_file.exists():
template_name = args.config
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to ~/.llama dir
config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml")
if not config_file.exists():
self.parser.error(
f"File {str(config_file)} does not exist.\n\nPlease run `llama stack build` to generate (and optionally edit) a run.yaml file"
)
if not config_file.is_file():
self.parser.error(
f"Config file must be a valid file path, '{config_file}' is not a file: type={type(config_file)}"
)
return config_file, template_name
def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
import yaml
from llama_stack.distribution.configure import parse_and_maybe_upgrade_config
from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
from llama_stack.distribution.utils.exec import formulate_run_args, run_command
if args.enable_ui:
self._start_ui_development_server(args.port)
image_type, image_name = self._get_image_type_and_name(args)
# Resolve config file and template name first
config_file, template_name = self._resolve_config_and_template(args)
# Check if config is required based on image type
if (image_type in [ImageType.CONDA.value, ImageType.VENV.value]) and not args.config:
if (image_type in [ImageType.CONDA.value, ImageType.VENV.value]) and not config_file:
self.parser.error("Config file is required for venv and conda environments")
if args.config:
config_file = Path(args.config)
has_yaml_suffix = args.config.endswith(".yaml")
template_name = None
if not config_file.exists() and not has_yaml_suffix:
# check if this is a template
config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
if config_file.exists():
template_name = args.config
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to ~/.llama dir
config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml")
if not config_file.exists():
self.parser.error(
f"File {str(config_file)} does not exist.\n\nPlease run `llama stack build` to generate (and optionally edit) a run.yaml file"
)
if not config_file.is_file():
self.parser.error(
f"Config file must be a valid file path, '{config_file}' is not a file: type={type(config_file)}"
)
if config_file:
logger.info(f"Using run configuration: {config_file}")
try:
@ -138,8 +148,6 @@ class StackRun(Subcommand):
self.parser.error(f"failed to parse config file '{config_file}':\n {e}")
else:
config = None
config_file = None
template_name = None
# If neither image type nor image name is provided, assume the server should be run directly
# using the current environment packages.
@ -155,8 +163,12 @@ class StackRun(Subcommand):
# func=<bound method StackRun._run_stack_run_cmd of <llama_stack.cli.stack.run.StackRun object at 0x10484b010>>
if callable(getattr(args, arg)):
continue
if arg == "config" and template_name:
server_args.config = str(config_file)
if arg == "config":
if template_name:
server_args.template = str(template_name)
else:
# Set the config file path
server_args.config = str(config_file)
else:
setattr(server_args, arg, getattr(args, arg))

View file

@ -81,7 +81,7 @@ def is_action_allowed(
if not len(policy):
policy = default_policy()
qualified_resource_id = resource.type + "::" + resource.identifier
qualified_resource_id = f"{resource.type}::{resource.identifier}"
for rule in policy:
if rule.forbid and matches_scope(rule.forbid, action, qualified_resource_id, user.principal):
if rule.when:

View file

@ -96,7 +96,7 @@ FROM $container_base
WORKDIR /app
# We install the Python 3.12 dev headers and build tools so that any
# Cextension wheels (e.g. polyleven, faisscpu) can compile successfully.
# C-extension wheels (e.g. polyleven, faiss-cpu) can compile successfully.
RUN dnf -y update && dnf install -y iputils git net-tools wget \
vim-minimal python3.12 python3.12-pip python3.12-wheel \
@ -169,7 +169,7 @@ if [ -n "$run_config" ]; then
echo "Copying external providers directory: $external_providers_dir"
cp -r "$external_providers_dir" "$BUILD_CONTEXT_DIR/providers.d"
add_to_container << EOF
COPY --chmod=g+w providers.d /.llama/providers.d
COPY providers.d /.llama/providers.d
EOF
fi

View file

@ -17,7 +17,7 @@ from llama_stack.distribution.distribution import (
builtin_automatically_routed_apis,
get_provider_registry,
)
from llama_stack.distribution.stack import replace_env_vars
from llama_stack.distribution.stack import cast_image_name_to_string, replace_env_vars
from llama_stack.distribution.utils.config_dirs import EXTERNAL_PROVIDERS_DIR
from llama_stack.distribution.utils.dynamic import instantiate_class_type
from llama_stack.distribution.utils.prompt_for_config import prompt_for_config
@ -164,7 +164,8 @@ def upgrade_from_routing_table(
def parse_and_maybe_upgrade_config(config_dict: dict[str, Any]) -> StackRunConfig:
version = config_dict.get("version", None)
if version == LLAMA_STACK_RUN_CONFIG_VERSION:
return StackRunConfig(**replace_env_vars(config_dict))
processed_config_dict = replace_env_vars(config_dict)
return StackRunConfig(**cast_image_name_to_string(processed_config_dict))
if "routing_table" in config_dict:
logger.info("Upgrading config...")
@ -175,4 +176,5 @@ def parse_and_maybe_upgrade_config(config_dict: dict[str, Any]) -> StackRunConfi
if not config_dict.get("external_providers_dir", None):
config_dict["external_providers_dir"] = EXTERNAL_PROVIDERS_DIR
return StackRunConfig(**replace_env_vars(config_dict))
processed_config_dict = replace_env_vars(config_dict)
return StackRunConfig(**cast_image_name_to_string(processed_config_dict))

View file

@ -6,9 +6,9 @@
from enum import StrEnum
from pathlib import Path
from typing import Annotated, Any
from typing import Annotated, Any, Literal, Self
from pydantic import BaseModel, Field, field_validator
from pydantic import BaseModel, Field, field_validator, model_validator
from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput
from llama_stack.apis.datasetio import DatasetIO
@ -161,23 +161,113 @@ class LoggingConfig(BaseModel):
)
class OAuth2JWKSConfig(BaseModel):
# The JWKS URI for collecting public keys
uri: str
token: str | None = Field(default=None, description="token to authorise access to jwks")
key_recheck_period: int = Field(default=3600, description="The period to recheck the JWKS URI for key updates")
class OAuth2IntrospectionConfig(BaseModel):
url: str
client_id: str
client_secret: str
send_secret_in_body: bool = False
class AuthProviderType(StrEnum):
"""Supported authentication provider types."""
OAUTH2_TOKEN = "oauth2_token"
GITHUB_TOKEN = "github_token"
CUSTOM = "custom"
class OAuth2TokenAuthConfig(BaseModel):
"""Configuration for OAuth2 token authentication."""
type: Literal[AuthProviderType.OAUTH2_TOKEN] = AuthProviderType.OAUTH2_TOKEN
audience: str = Field(default="llama-stack")
verify_tls: bool = Field(default=True)
tls_cafile: Path | None = Field(default=None)
issuer: str | None = Field(default=None, description="The OIDC issuer URL.")
claims_mapping: dict[str, str] = Field(
default_factory=lambda: {
"sub": "roles",
"username": "roles",
"groups": "teams",
"team": "teams",
"project": "projects",
"tenant": "namespaces",
"namespace": "namespaces",
},
)
jwks: OAuth2JWKSConfig | None = Field(default=None, description="JWKS configuration")
introspection: OAuth2IntrospectionConfig | None = Field(
default=None, description="OAuth2 introspection configuration"
)
@classmethod
@field_validator("claims_mapping")
def validate_claims_mapping(cls, v):
for key, value in v.items():
if not value:
raise ValueError(f"claims_mapping value cannot be empty: {key}")
return v
@model_validator(mode="after")
def validate_mode(self) -> Self:
if not self.jwks and not self.introspection:
raise ValueError("One of jwks or introspection must be configured")
if self.jwks and self.introspection:
raise ValueError("At present only one of jwks or introspection should be configured")
return self
class CustomAuthConfig(BaseModel):
"""Configuration for custom authentication."""
type: Literal[AuthProviderType.CUSTOM] = AuthProviderType.CUSTOM
endpoint: str = Field(
...,
description="Custom authentication endpoint URL",
)
class GitHubTokenAuthConfig(BaseModel):
"""Configuration for GitHub token authentication."""
type: Literal[AuthProviderType.GITHUB_TOKEN] = AuthProviderType.GITHUB_TOKEN
github_api_base_url: str = Field(
default="https://api.github.com",
description="Base URL for GitHub API (use https://api.github.com for public GitHub)",
)
claims_mapping: dict[str, str] = Field(
default_factory=lambda: {
"login": "roles",
"organizations": "teams",
},
description="Mapping from GitHub user fields to access attributes",
)
AuthProviderConfig = Annotated[
OAuth2TokenAuthConfig | GitHubTokenAuthConfig | CustomAuthConfig,
Field(discriminator="type"),
]
class AuthenticationConfig(BaseModel):
provider_type: AuthProviderType = Field(
"""Top-level authentication configuration."""
provider_config: AuthProviderConfig = Field(
...,
description="Type of authentication provider",
description="Authentication provider configuration",
)
config: dict[str, Any] = Field(
...,
description="Provider-specific configuration",
access_policy: list[AccessRule] = Field(
default=[],
description="Rules for determining access to resources",
)
access_policy: list[AccessRule] = Field(default=[], description="Rules for determining access to resources")
class AuthenticationRequiredError(Exception):

View file

@ -200,7 +200,7 @@ def validate_and_prepare_providers(
specs = {}
for provider in providers:
if not provider.provider_id or provider.provider_id == "__disabled__":
logger.warning(f"Provider `{provider.provider_type}` for API `{api}` is disabled")
logger.debug(f"Provider `{provider.provider_type}` for API `{api}` is disabled")
continue
validate_provider(provider, api, provider_registry)

View file

@ -5,6 +5,7 @@
# the root directory of this source tree.
import asyncio
import uuid
from typing import Any
from llama_stack.apis.common.content_types import (
@ -81,6 +82,7 @@ class VectorIORouter(VectorIO):
embedding_model: str,
embedding_dimension: int | None = 384,
provider_id: str | None = None,
vector_db_name: str | None = None,
provider_vector_db_id: str | None = None,
) -> None:
logger.debug(f"VectorIORouter.register_vector_db: {vector_db_id}, {embedding_model}")
@ -89,6 +91,7 @@ class VectorIORouter(VectorIO):
embedding_model,
embedding_dimension,
provider_id,
vector_db_name,
provider_vector_db_id,
)
@ -123,7 +126,6 @@ class VectorIORouter(VectorIO):
embedding_model: str | None = None,
embedding_dimension: int | None = None,
provider_id: str | None = None,
provider_vector_db_id: str | None = None,
) -> VectorStoreObject:
logger.debug(f"VectorIORouter.openai_create_vector_store: name={name}, provider_id={provider_id}")
@ -135,17 +137,17 @@ class VectorIORouter(VectorIO):
embedding_model, embedding_dimension = embedding_model_info
logger.info(f"No embedding model specified, using first available: {embedding_model}")
vector_db_id = name
vector_db_id = f"vs_{uuid.uuid4()}"
registered_vector_db = await self.routing_table.register_vector_db(
vector_db_id,
embedding_model,
embedding_dimension,
provider_id,
provider_vector_db_id,
vector_db_id=vector_db_id,
embedding_model=embedding_model,
embedding_dimension=embedding_dimension,
provider_id=provider_id,
provider_vector_db_id=vector_db_id,
vector_db_name=name,
)
return await self.routing_table.get_provider_impl(registered_vector_db.identifier).openai_create_vector_store(
vector_db_id,
name=name,
file_ids=file_ids,
expires_after=expires_after,
chunking_strategy=chunking_strategy,

View file

@ -36,6 +36,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
embedding_dimension: int | None = 384,
provider_id: str | None = None,
provider_vector_db_id: str | None = None,
vector_db_name: str | None = None,
) -> VectorDB:
if provider_vector_db_id is None:
provider_vector_db_id = vector_db_id
@ -62,6 +63,7 @@ class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
"provider_resource_id": provider_vector_db_id,
"embedding_model": embedding_model,
"embedding_dimension": model.metadata["embedding_dimension"],
"vector_db_name": vector_db_name,
}
vector_db = TypeAdapter(VectorDBWithOwner).validate_python(vector_db_data)
await self.register_object(vector_db)

View file

@ -87,8 +87,12 @@ class AuthenticationMiddleware:
headers = dict(scope.get("headers", []))
auth_header = headers.get(b"authorization", b"").decode()
if not auth_header or not auth_header.startswith("Bearer "):
return await self._send_auth_error(send, "Missing or invalid Authorization header")
if not auth_header:
error_msg = self.auth_provider.get_auth_error_message(scope)
return await self._send_auth_error(send, error_msg)
if not auth_header.startswith("Bearer "):
return await self._send_auth_error(send, "Invalid Authorization header format")
token = auth_header.split("Bearer ", 1)[1]

View file

@ -8,15 +8,19 @@ import ssl
import time
from abc import ABC, abstractmethod
from asyncio import Lock
from pathlib import Path
from typing import Self
from urllib.parse import parse_qs
from urllib.parse import parse_qs, urlparse
import httpx
from jose import jwt
from pydantic import BaseModel, Field, field_validator, model_validator
from pydantic import BaseModel, Field
from llama_stack.distribution.datatypes import AuthenticationConfig, AuthProviderType, User
from llama_stack.distribution.datatypes import (
AuthenticationConfig,
CustomAuthConfig,
GitHubTokenAuthConfig,
OAuth2TokenAuthConfig,
User,
)
from llama_stack.log import get_logger
logger = get_logger(name=__name__, category="auth")
@ -38,9 +42,7 @@ class AuthRequestContext(BaseModel):
headers: dict[str, str] = Field(description="HTTP headers from the original request (excluding Authorization)")
params: dict[str, list[str]] = Field(
description="Query parameters from the original request, parsed as dictionary of lists"
)
params: dict[str, list[str]] = Field(default_factory=dict, description="Query parameters from the original request")
class AuthRequest(BaseModel):
@ -62,6 +64,10 @@ class AuthProvider(ABC):
"""Clean up any resources."""
pass
def get_auth_error_message(self, scope: dict | None = None) -> str:
"""Return provider-specific authentication error message."""
return "Authentication required"
def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str]) -> dict[str, list[str]]:
attributes: dict[str, list[str]] = {}
@ -81,56 +87,6 @@ def get_attributes_from_claims(claims: dict[str, str], mapping: dict[str, str])
return attributes
class OAuth2JWKSConfig(BaseModel):
# The JWKS URI for collecting public keys
uri: str
token: str | None = Field(default=None, description="token to authorise access to jwks")
key_recheck_period: int = Field(default=3600, description="The period to recheck the JWKS URI for key updates")
class OAuth2IntrospectionConfig(BaseModel):
url: str
client_id: str
client_secret: str
send_secret_in_body: bool = False
class OAuth2TokenAuthProviderConfig(BaseModel):
audience: str = "llama-stack"
verify_tls: bool = True
tls_cafile: Path | None = None
issuer: str | None = Field(default=None, description="The OIDC issuer URL.")
claims_mapping: dict[str, str] = Field(
default_factory=lambda: {
"sub": "roles",
"username": "roles",
"groups": "teams",
"team": "teams",
"project": "projects",
"tenant": "namespaces",
"namespace": "namespaces",
},
)
jwks: OAuth2JWKSConfig | None
introspection: OAuth2IntrospectionConfig | None = None
@classmethod
@field_validator("claims_mapping")
def validate_claims_mapping(cls, v):
for key, value in v.items():
if not value:
raise ValueError(f"claims_mapping value cannot be empty: {key}")
return v
@model_validator(mode="after")
def validate_mode(self) -> Self:
if not self.jwks and not self.introspection:
raise ValueError("One of jwks or introspection must be configured")
if self.jwks and self.introspection:
raise ValueError("At present only one of jwks or introspection should be configured")
return self
class OAuth2TokenAuthProvider(AuthProvider):
"""
JWT token authentication provider that validates a JWT token and extracts access attributes.
@ -138,7 +94,7 @@ class OAuth2TokenAuthProvider(AuthProvider):
This should be the standard authentication provider for most use cases.
"""
def __init__(self, config: OAuth2TokenAuthProviderConfig):
def __init__(self, config: OAuth2TokenAuthConfig):
self.config = config
self._jwks_at: float = 0.0
self._jwks: dict[str, str] = {}
@ -170,7 +126,7 @@ class OAuth2TokenAuthProvider(AuthProvider):
issuer=self.config.issuer,
)
except Exception as exc:
raise ValueError(f"Invalid JWT token: {token}") from exc
raise ValueError("Invalid JWT token") from exc
# There are other standard claims, the most relevant of which is `scope`.
# We should incorporate these into the access attributes.
@ -232,6 +188,17 @@ class OAuth2TokenAuthProvider(AuthProvider):
async def close(self):
pass
def get_auth_error_message(self, scope: dict | None = None) -> str:
"""Return OAuth2-specific authentication error message."""
if self.config.issuer:
return f"Authentication required. Please provide a valid OAuth2 Bearer token from {self.config.issuer}"
elif self.config.introspection:
# Extract domain from introspection URL for a cleaner message
domain = urlparse(self.config.introspection.url).netloc
return f"Authentication required. Please provide a valid OAuth2 Bearer token validated by {domain}"
else:
return "Authentication required. Please provide a valid OAuth2 Bearer token in the Authorization header"
async def _refresh_jwks(self) -> None:
"""
Refresh the JWKS cache.
@ -264,14 +231,10 @@ class OAuth2TokenAuthProvider(AuthProvider):
self._jwks_at = time.time()
class CustomAuthProviderConfig(BaseModel):
endpoint: str
class CustomAuthProvider(AuthProvider):
"""Custom authentication provider that uses an external endpoint."""
def __init__(self, config: CustomAuthProviderConfig):
def __init__(self, config: CustomAuthConfig):
self.config = config
self._client = None
@ -317,7 +280,7 @@ class CustomAuthProvider(AuthProvider):
try:
response_data = response.json()
auth_response = AuthResponse(**response_data)
return User(auth_response.principal, auth_response.attributes)
return User(principal=auth_response.principal, attributes=auth_response.attributes)
except Exception as e:
logger.exception("Error parsing authentication response")
raise ValueError("Invalid authentication response format") from e
@ -338,15 +301,88 @@ class CustomAuthProvider(AuthProvider):
await self._client.aclose()
self._client = None
def get_auth_error_message(self, scope: dict | None = None) -> str:
"""Return custom auth provider-specific authentication error message."""
domain = urlparse(self.config.endpoint).netloc
if domain:
return f"Authentication required. Please provide your API key as a Bearer token (validated by {domain})"
else:
return "Authentication required. Please provide your API key as a Bearer token in the Authorization header"
class GitHubTokenAuthProvider(AuthProvider):
"""
GitHub token authentication provider that validates GitHub access tokens directly.
This provider accepts GitHub personal access tokens or OAuth tokens and verifies
them against the GitHub API to get user information.
"""
def __init__(self, config: GitHubTokenAuthConfig):
self.config = config
async def validate_token(self, token: str, scope: dict | None = None) -> User:
"""Validate a GitHub token by calling the GitHub API.
This validates tokens issued by GitHub (personal access tokens or OAuth tokens).
"""
try:
user_info = await _get_github_user_info(token, self.config.github_api_base_url)
except httpx.HTTPStatusError as e:
logger.warning(f"GitHub token validation failed: {e}")
raise ValueError("GitHub token validation failed. Please check your token and try again.") from e
principal = user_info["user"]["login"]
github_data = {
"login": user_info["user"]["login"],
"id": str(user_info["user"]["id"]),
"organizations": user_info.get("organizations", []),
}
access_attributes = get_attributes_from_claims(github_data, self.config.claims_mapping)
return User(
principal=principal,
attributes=access_attributes,
)
async def close(self):
"""Clean up any resources."""
pass
def get_auth_error_message(self, scope: dict | None = None) -> str:
"""Return GitHub-specific authentication error message."""
return "Authentication required. Please provide a valid GitHub access token (https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) in the Authorization header (Bearer <token>)"
async def _get_github_user_info(access_token: str, github_api_base_url: str) -> dict:
"""Fetch user info and organizations from GitHub API."""
headers = {
"Authorization": f"Bearer {access_token}",
"Accept": "application/vnd.github.v3+json",
"User-Agent": "llama-stack",
}
async with httpx.AsyncClient() as client:
user_response = await client.get(f"{github_api_base_url}/user", headers=headers, timeout=10.0)
user_response.raise_for_status()
user_data = user_response.json()
return {
"user": user_data,
}
def create_auth_provider(config: AuthenticationConfig) -> AuthProvider:
"""Factory function to create the appropriate auth provider."""
provider_type = config.provider_type.lower()
provider_config = config.provider_config
if provider_type == "custom":
return CustomAuthProvider(CustomAuthProviderConfig.model_validate(config.config))
elif provider_type == "oauth2_token":
return OAuth2TokenAuthProvider(OAuth2TokenAuthProviderConfig.model_validate(config.config))
if isinstance(provider_config, CustomAuthConfig):
return CustomAuthProvider(provider_config)
elif isinstance(provider_config, OAuth2TokenAuthConfig):
return OAuth2TokenAuthProvider(provider_config)
elif isinstance(provider_config, GitHubTokenAuthConfig):
return GitHubTokenAuthProvider(provider_config)
else:
supported_providers = ", ".join([t.value for t in AuthProviderType])
raise ValueError(f"Unsupported auth provider type: {provider_type}. Supported types are: {supported_providers}")
raise ValueError(f"Unknown authentication provider config type: {type(provider_config)}")

View file

@ -33,7 +33,11 @@ from pydantic import BaseModel, ValidationError
from llama_stack.apis.common.responses import PaginatedResponse
from llama_stack.distribution.access_control.access_control import AccessDeniedError
from llama_stack.distribution.datatypes import AuthenticationRequiredError, LoggingConfig, StackRunConfig
from llama_stack.distribution.datatypes import (
AuthenticationRequiredError,
LoggingConfig,
StackRunConfig,
)
from llama_stack.distribution.distribution import builtin_automatically_routed_apis
from llama_stack.distribution.request_headers import PROVIDER_DATA_VAR, User, request_provider_data_context
from llama_stack.distribution.resolver import InvalidProviderError
@ -43,6 +47,7 @@ from llama_stack.distribution.server.routes import (
initialize_route_impls,
)
from llama_stack.distribution.stack import (
cast_image_name_to_string,
construct_stack,
replace_env_vars,
validate_env_pair,
@ -217,7 +222,7 @@ def create_dynamic_typed_route(func: Any, method: str, route: str) -> Callable:
# Get auth attributes from the request scope
user_attributes = request.scope.get("user_attributes", {})
principal = request.scope.get("principal", "")
user = User(principal, user_attributes)
user = User(principal=principal, attributes=user_attributes)
await log_request_pre_validation(request)
@ -405,13 +410,13 @@ def main(args: argparse.Namespace | None = None):
args = parser.parse_args()
log_line = ""
if args.config:
if hasattr(args, "config") and args.config:
# if the user provided a config file, use it, even if template was specified
config_file = Path(args.config)
if not config_file.exists():
raise ValueError(f"Config file {config_file} does not exist")
log_line = f"Using config file: {config_file}"
elif args.template:
elif hasattr(args, "template") and args.template:
config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.template / "run.yaml"
if not config_file.exists():
raise ValueError(f"Template {args.template} does not exist")
@ -435,14 +440,12 @@ def main(args: argparse.Namespace | None = None):
logger.error(f"Error: {str(e)}")
sys.exit(1)
config = replace_env_vars(config_contents)
config = StackRunConfig(**config)
config = StackRunConfig(**cast_image_name_to_string(config))
# now that the logger is initialized, print the line about which type of config we are using.
logger.info(log_line)
logger.info("Run configuration:")
safe_config = redact_sensitive_fields(config.model_dump())
logger.info(yaml.dump(safe_config, indent=2))
_log_run_config(run_config=config)
app = FastAPI(
lifespan=lifespan,
@ -450,12 +453,13 @@ def main(args: argparse.Namespace | None = None):
redoc_url="/redoc",
openapi_url="/openapi.json",
)
if not os.environ.get("LLAMA_STACK_DISABLE_VERSION_CHECK"):
app.add_middleware(ClientVersionMiddleware)
# Add authentication middleware if configured
if config.server.auth:
logger.info(f"Enabling authentication with provider: {config.server.auth.provider_type.value}")
logger.info(f"Enabling authentication with provider: {config.server.auth.provider_config.type.value}")
app.add_middleware(AuthenticationMiddleware, auth_config=config.server.auth)
else:
if config.server.quota:
@ -488,7 +492,13 @@ def main(args: argparse.Namespace | None = None):
)
try:
impls = asyncio.run(construct_stack(config))
# Create and set the event loop that will be used for both construction and server runtime
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# Construct the stack in the persistent event loop
impls = loop.run_until_complete(construct_stack(config))
except InvalidProviderError as e:
logger.error(f"Error: {str(e)}")
sys.exit(1)
@ -586,7 +596,16 @@ def main(args: argparse.Namespace | None = None):
if ssl_config:
uvicorn_config.update(ssl_config)
uvicorn.run(**uvicorn_config)
# Run uvicorn in the existing event loop to preserve background tasks
loop.run_until_complete(uvicorn.Server(uvicorn.Config(**uvicorn_config)).serve())
def _log_run_config(run_config: StackRunConfig):
"""Logs the run config with redacted fields and disabled providers removed."""
logger.info("Run configuration:")
safe_config = redact_sensitive_fields(run_config.model_dump(mode="json"))
clean_config = remove_disabled_providers(safe_config)
logger.info(yaml.dump(clean_config, indent=2))
def extract_path_params(route: str) -> list[str]:
@ -597,5 +616,20 @@ def extract_path_params(route: str) -> list[str]:
return params
def remove_disabled_providers(obj):
if isinstance(obj, dict):
if (
obj.get("provider_id") == "__disabled__"
or obj.get("shield_id") == "__disabled__"
or obj.get("provider_model_id") == "__disabled__"
):
return None
return {k: v for k, v in ((k, remove_disabled_providers(v)) for k, v in obj.items()) if v is not None}
elif isinstance(obj, list):
return [item for item in (remove_disabled_providers(i) for i in obj) if item is not None]
else:
return obj
if __name__ == "__main__":
main()

View file

@ -98,6 +98,7 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
method = getattr(impls[api], register_method)
for obj in objects:
logger.debug(f"registering {rsrc.capitalize()} {obj} for provider {obj.provider_id}")
# Do not register models on disabled providers
if hasattr(obj, "provider_id") and obj.provider_id is not None and obj.provider_id == "__disabled__":
logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled provider.")
@ -112,6 +113,11 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]):
):
logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled model.")
continue
if hasattr(obj, "shield_id") and obj.shield_id is not None and obj.shield_id == "__disabled__":
logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled shield.")
continue
# we want to maintain the type information in arguments to method.
# instead of method(**obj.model_dump()), which may convert a typed attr to a dict,
# we use model_dump() to find all the attrs and then getattr to get the still typed value.
@ -166,7 +172,6 @@ def replace_env_vars(config: Any, path: str = "") -> Any:
# Create a copy with resolved provider_id but original config
disabled_provider = v.copy()
disabled_provider["provider_id"] = resolved_provider_id
result.append(disabled_provider)
continue
except EnvVarError:
# If we can't resolve the provider_id, continue with normal processing
@ -261,6 +266,13 @@ def _convert_string_to_proper_type(value: str) -> Any:
return value
def cast_image_name_to_string(config_dict: dict[str, Any]) -> dict[str, Any]:
"""Ensure that any value for a key 'image_name' in a config_dict is a string"""
if "image_name" in config_dict and config_dict["image_name"] is not None:
config_dict["image_name"] = str(config_dict["image_name"])
return config_dict
def validate_env_pair(env_pair: str) -> tuple[str, str]:
"""Validate and split an environment variable key-value pair."""
try:

View file

@ -6,12 +6,9 @@
from collections.abc import AsyncGenerator
from contextvars import ContextVar
from typing import TypeVar
T = TypeVar("T")
def preserve_contexts_async_generator(
def preserve_contexts_async_generator[T](
gen: AsyncGenerator[T, None], context_vars: list[ContextVar]
) -> AsyncGenerator[T, None]:
"""

View file

@ -8,6 +8,7 @@ import io
import json
import uuid
from dataclasses import dataclass
from typing import Any
from PIL import Image as PIL_Image
@ -184,16 +185,26 @@ class ChatFormat:
content = content[: -len("<|eom_id|>")]
stop_reason = StopReason.end_of_message
tool_name = None
tool_arguments = {}
tool_name: str | BuiltinTool | None = None
tool_arguments: dict[str, Any] = {}
custom_tool_info = ToolUtils.maybe_extract_custom_tool_call(content)
if custom_tool_info is not None:
tool_name, tool_arguments = custom_tool_info
# Type guard: ensure custom_tool_info is a tuple of correct types
if isinstance(custom_tool_info, tuple) and len(custom_tool_info) == 2:
extracted_tool_name, extracted_tool_arguments = custom_tool_info
# Handle both dict and str return types from the function
if isinstance(extracted_tool_arguments, dict):
tool_name, tool_arguments = extracted_tool_name, extracted_tool_arguments
else:
# If it's a string, treat it as a query parameter
tool_name, tool_arguments = extracted_tool_name, {"query": extracted_tool_arguments}
else:
tool_name, tool_arguments = None, {}
# Sometimes when agent has custom tools alongside builin tools
# Agent responds for builtin tool calls in the format of the custom tools
# This code tries to handle that case
if tool_name in BuiltinTool.__members__:
if tool_name is not None and tool_name in BuiltinTool.__members__:
tool_name = BuiltinTool[tool_name]
if isinstance(tool_arguments, dict):
tool_arguments = {

View file

@ -178,6 +178,7 @@ def usecases() -> list[UseCase | str]:
),
RawMessage(role="user", content="What is the 100th decimal of pi?"),
RawMessage(
role="assistant",
content="",
stop_reason=StopReason.end_of_message,
tool_calls=[

View file

@ -24,8 +24,8 @@ class ShieldRunnerMixin:
def __init__(
self,
safety_api: Safety,
input_shields: list[str] = None,
output_shields: list[str] = None,
input_shields: list[str] | None = None,
output_shields: list[str] | None = None,
):
self.safety_api = safety_api
self.input_shields = input_shields
@ -37,6 +37,7 @@ class ShieldRunnerMixin:
return await self.safety_api.run_shield(
shield_id=identifier,
messages=messages,
params={},
)
responses = await asyncio.gather(*[run_shield_with_span(identifier) for identifier in identifiers])

Some files were not shown because too many files have changed in this diff Show more