Compare commits

..

No commits in common. "kvant" and "v0.0.55" have entirely different histories.

1164 changed files with 37765 additions and 499274 deletions

View file

@ -1,6 +0,0 @@
[run]
omit =
*/tests/*
*/llama_stack/providers/*
*/llama_stack/templates/*
.venv/*

31
.flake8 Normal file
View file

@ -0,0 +1,31 @@
[flake8]
# Suggested config from pytorch that we can adapt
select = B,C,E,F,N,P,T4,W,B9,TOR0,TOR1,TOR2
max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
# N812 ignored because import torch.nn.functional as F is PyTorch convention
# N817 ignored because importing using acronyms is convention (DistributedDataParallel as DDP)
# E731 allow usage of assigning lambda expressions
# E701 let black auto-format statements on one line
# E704 let black auto-format statements on one line
ignore =
E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,N812,N817,E731,E701,E704
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
# random naming hints don't need
N802,
# these ignores are from flake8-bugbear; please fix!
B007,B008,B950
optional-ascii-coding = True
exclude =
./.git,
./docs/*,
./build,
./scripts,
./venv,
*.pyi,
.pre-commit-config.yaml,
*.md,
.flake8

2
.github/CODEOWNERS vendored
View file

@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning
* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham

View file

@ -1,6 +1,6 @@
name: 🐛 Bug Report
description: Create a report to help us reproduce and fix the bug
labels: ["bug"]
body:
- type: markdown
attributes:

View file

@ -1,12 +0,0 @@
blank_issues_enabled: false
contact_links:
- name: Have you read the docs?
url: https://llama-stack.readthedocs.io/en/latest/index.html
about: Much help can be found in the docs
- name: Start a discussion
url: https://github.com/meta-llama/llama-stack/discussions/new
about: Start a discussion on a topic
- name: Chat on Discord
url: https://discord.gg/llama-stack
about: Maybe chatting with the community can help

View file

@ -1,6 +1,6 @@
name: 🚀 Feature request
description: Request a new llama-stack feature
labels: ["enhancement"]
body:
- type: textarea
id: feature-pitch

View file

@ -1,8 +1,27 @@
# What does this PR do?
<!-- Provide a short summary of what this PR does and why. Link to relevant issues if applicable. -->
<!-- If resolving an issue, uncomment and update the line below -->
<!-- Closes #[issue-number] -->
In short, provide a summary of what this PR does and why. Usually, the relevant context should be present in a linked issue.
- [ ] Addresses issue (#issue)
## Test Plan
<!-- Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.* -->
Please describe:
- tests you ran to verify your changes with result summaries.
- provide instructions so it can be reproduced.
## Sources
Please link relevant resources if necessary.
## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md),
Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.

2
.github/TRIAGERS.md vendored
View file

@ -1,2 +0,0 @@
# This file documents Triage members in the Llama Stack community
@bbrowning @booxter @franciscojavierarceo @leseb

View file

@ -1,26 +0,0 @@
name: Setup Ollama
description: Start Ollama and cache model
inputs:
models:
description: Comma-separated list of models to pull
default: "llama3.2:3b-instruct-fp16,all-minilm:latest"
runs:
using: "composite"
steps:
- name: Install and start Ollama
shell: bash
run: |
# the ollama installer also starts the ollama service
curl -fsSL https://ollama.com/install.sh | sh
# Do NOT cache models - pulling the cache is actually slower than just pulling the model.
# It takes ~45 seconds to pull the models from the cache and unpack it, but only 30 seconds to
# pull them directly.
# Maybe this is because the cache is being pulled at the same time by all the matrix jobs?
- name: Pull requested models
if: inputs.models != ''
shell: bash
run: |
for model in $(echo "${{ inputs.models }}" | tr ',' ' '); do
ollama pull "$model"
done

View file

@ -1,22 +0,0 @@
name: Setup runner
description: Prepare a runner for the tests (install uv, python, project dependencies, etc.)
runs:
using: "composite"
steps:
- name: Install uv
uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
with:
python-version: "3.10"
activate-environment: true
version: 0.7.6
- name: Install dependencies
shell: bash
run: |
uv sync --all-groups
uv pip install ollama faiss-cpu
# always test against the latest version of the client
# TODO: this is not necessarily a good idea. we need to test against both published and latest
# to find out backwards compatibility issues.
uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
uv pip install -e .

View file

@ -1,23 +0,0 @@
# GitHub Dependabot configuration
version: 2
updates:
# Enable version updates for GitHub Actions
- package-ecosystem: "github-actions"
directory: "/" # Will use the default workflow location of `.github/workflows`
schedule:
interval: "weekly"
day: "saturday"
commit-message:
prefix: chore(github-deps)
- package-ecosystem: "uv"
directory: "/"
schedule:
interval: "weekly"
day: "saturday"
# ignore all non-security updates: https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#open-pull-requests-limit
open-pull-requests-limit: 0
labels:
- type/dependencies
- python
commit-message:
prefix: chore(python-deps)

View file

@ -1 +0,0 @@
FROM localhost:5000/distribution-kvant:dev

View file

@ -1,73 +0,0 @@
name: Build and Push playground container
run-name: Build and Push playground container
on:
workflow_dispatch:
#schedule:
# - cron: "0 10 * * *"
push:
branches:
- main
- kvant
tags:
- 'v*'
pull_request:
branches:
- main
- kvant
env:
IMAGE: git.kvant.cloud/${{github.repository}}-playground
jobs:
build-playground:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set current time
uses: https://github.com/gerred/actions/current-time@master
id: current_time
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to git.kvant.cloud registry
uses: docker/login-action@v3
with:
registry: git.kvant.cloud
username: ${{ vars.ORG_PACKAGE_WRITER_USERNAME }}
password: ${{ secrets.ORG_PACKAGE_WRITER_TOKEN }}
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# list of Docker images to use as base name for tags
images: |
${{env.IMAGE}}
# generate Docker tags based on the following events/attributes
tags: |
type=schedule
type=ref,event=branch
type=ref,event=pr
type=ref,event=tag
type=semver,pattern={{version}}
- name: Build and push to gitea registry
uses: docker/build-push-action@v6
with:
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
context: .
file: llama_stack/distribution/ui/Containerfile
provenance: mode=max
sbom: true
build-args: |
BUILD_DATE=${{ steps.current_time.outputs.time }}
cache-from: |
type=registry,ref=${{ env.IMAGE }}:buildcache
type=registry,ref=${{ env.IMAGE }}:${{ github.ref_name }}
type=registry,ref=${{ env.IMAGE }}:main
cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache,mode=max,image-manifest=true

View file

@ -1,98 +0,0 @@
name: Build and Push container
run-name: Build and Push container
on:
workflow_dispatch:
#schedule:
# - cron: "0 10 * * *"
push:
branches:
- main
- kvant
tags:
- 'v*'
pull_request:
branches:
- main
- kvant
env:
IMAGE: git.kvant.cloud/${{github.repository}}
jobs:
build:
runs-on: ubuntu-latest
services:
registry:
image: registry:2
ports:
- 5000:5000
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set current time
uses: https://github.com/gerred/actions/current-time@master
id: current_time
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
driver-opts: network=host
- name: Login to git.kvant.cloud registry
uses: docker/login-action@v3
with:
registry: git.kvant.cloud
username: ${{ vars.ORG_PACKAGE_WRITER_USERNAME }}
password: ${{ secrets.ORG_PACKAGE_WRITER_TOKEN }}
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# list of Docker images to use as base name for tags
images: |
${{env.IMAGE}}
# generate Docker tags based on the following events/attributes
tags: |
type=schedule
type=ref,event=branch
type=ref,event=pr
type=ref,event=tag
type=semver,pattern={{version}}
- name: Install uv
uses: https://github.com/astral-sh/setup-uv@v5
with:
# Install a specific version of uv.
version: "0.7.8"
- name: Build
env:
USE_COPY_NOT_MOUNT: true
LLAMA_STACK_DIR: .
run: |
uvx --from . llama stack build --template kvant --image-type container
# docker tag distribution-kvant:dev ${{env.IMAGE}}:kvant
# docker push ${{env.IMAGE}}:kvant
docker tag distribution-kvant:dev localhost:5000/distribution-kvant:dev
docker push localhost:5000/distribution-kvant:dev
- name: Build and push to gitea registry
uses: docker/build-push-action@v6
with:
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
context: .github/workflows
provenance: mode=max
sbom: true
build-args: |
BUILD_DATE=${{ steps.current_time.outputs.time }}
cache-from: |
type=registry,ref=${{ env.IMAGE }}:buildcache
type=registry,ref=${{ env.IMAGE }}:${{ github.ref_name }}
type=registry,ref=${{ env.IMAGE }}:main
cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache,mode=max,image-manifest=true

25
.github/workflows/pre-commit.yml vendored Normal file
View file

@ -0,0 +1,25 @@
name: Pre-commit
on:
pull_request:
push:
branches: [main]
jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
- name: Set up Python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
with:
python-version: '3.11'
cache: pip
cache-dependency-path: |
**/requirements*.txt
.pre-commit-config.yaml
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd #v3.0.1

View file

@ -1,29 +0,0 @@
name: Update Changelog
on:
release:
types: [published, unpublished, created, edited, deleted, released]
permissions:
contents: read
jobs:
generate_changelog:
name: Generate changelog
permissions:
contents: write # for peter-evans/create-pull-request to create branch
pull-requests: write # for peter-evans/create-pull-request to create a PR
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: main
fetch-depth: 0
- run: |
python ./scripts/gen-changelog.py
- uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
with:
title: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
commit-message: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
branch: create-pull-request/changelog
signoff: true

View file

@ -1,355 +0,0 @@
name: "Run Llama-stack Tests"
on:
#### Temporarily disable PR runs until tests run as intended within mainline.
#TODO Add this back.
#pull_request_target:
# types: ["opened"]
# branches:
# - 'main'
# paths:
# - 'llama_stack/**/*.py'
# - 'tests/**/*.py'
workflow_dispatch:
inputs:
runner:
description: 'GHA Runner Scale Set label to run workflow on.'
required: true
default: "llama-stack-gha-runner-gpu"
checkout_reference:
description: "The branch, tag, or SHA to checkout"
required: true
default: "main"
debug:
description: 'Run debugging steps?'
required: false
default: "true"
sleep_time:
description: '[DEBUG] sleep time for debugging'
required: true
default: "0"
provider_id:
description: 'ID of your provider'
required: true
default: "meta_reference"
model_id:
description: 'Shorthand name for target model ID (llama_3b or llama_8b)'
required: true
default: "llama_3b"
model_override_3b:
description: 'Specify shorthand model for <llama_3b> '
required: false
default: "Llama3.2-3B-Instruct"
model_override_8b:
description: 'Specify shorthand model for <llama_8b> '
required: false
default: "Llama3.1-8B-Instruct"
env:
# ID used for each test's provider config
PROVIDER_ID: "${{ inputs.provider_id || 'meta_reference' }}"
# Path to model checkpoints within EFS volume
MODEL_CHECKPOINT_DIR: "/data/llama"
# Path to directory to run tests from
TESTS_PATH: "${{ github.workspace }}/llama_stack/providers/tests"
# Keep track of a list of model IDs that are valid to use within pytest fixture marks
AVAILABLE_MODEL_IDs: "llama_3b llama_8b"
# Shorthand name for model ID, used in pytest fixture marks
MODEL_ID: "${{ inputs.model_id || 'llama_3b' }}"
# Override the `llama_3b` / `llama_8b' models, else use the default.
LLAMA_3B_OVERRIDE: "${{ inputs.model_override_3b || 'Llama3.2-3B-Instruct' }}"
LLAMA_8B_OVERRIDE: "${{ inputs.model_override_8b || 'Llama3.1-8B-Instruct' }}"
# Defines which directories in TESTS_PATH to exclude from the test loop
EXCLUDED_DIRS: "__pycache__"
# Defines the output xml reports generated after a test is run
REPORTS_GEN: ""
jobs:
execute_workflow:
name: Execute workload on Self-Hosted GPU k8s runner
permissions:
pull-requests: write
defaults:
run:
shell: bash
runs-on: ${{ inputs.runner != '' && inputs.runner || 'llama-stack-gha-runner-gpu' }}
if: always()
steps:
##############################
#### INITIAL DEBUG CHECKS ####
##############################
- name: "[DEBUG] Check content of the EFS mount"
id: debug_efs_volume
continue-on-error: true
if: inputs.debug == 'true'
run: |
echo "========= Content of the EFS mount ============="
ls -la ${{ env.MODEL_CHECKPOINT_DIR }}
- name: "[DEBUG] Get runner container OS information"
id: debug_os_info
if: ${{ inputs.debug == 'true' }}
run: |
cat /etc/os-release
- name: "[DEBUG] Print environment variables"
id: debug_env_vars
if: ${{ inputs.debug == 'true' }}
run: |
echo "PROVIDER_ID = ${PROVIDER_ID}"
echo "MODEL_CHECKPOINT_DIR = ${MODEL_CHECKPOINT_DIR}"
echo "AVAILABLE_MODEL_IDs = ${AVAILABLE_MODEL_IDs}"
echo "MODEL_ID = ${MODEL_ID}"
echo "LLAMA_3B_OVERRIDE = ${LLAMA_3B_OVERRIDE}"
echo "LLAMA_8B_OVERRIDE = ${LLAMA_8B_OVERRIDE}"
echo "EXCLUDED_DIRS = ${EXCLUDED_DIRS}"
echo "REPORTS_GEN = ${REPORTS_GEN}"
############################
#### MODEL INPUT CHECKS ####
############################
- name: "Check if env.model_id is valid"
id: check_model_id
run: |
if [[ " ${AVAILABLE_MODEL_IDs[@]} " =~ " ${MODEL_ID} " ]]; then
echo "Model ID '${MODEL_ID}' is valid."
else
echo "Model ID '${MODEL_ID}' is invalid. Terminating workflow."
exit 1
fi
#######################
#### CODE CHECKOUT ####
#######################
- name: "Checkout 'meta-llama/llama-stack' repository"
id: checkout_repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ inputs.branch }}
- name: "[DEBUG] Content of the repository after checkout"
id: debug_content_after_checkout
if: ${{ inputs.debug == 'true' }}
run: |
ls -la ${GITHUB_WORKSPACE}
##########################################################
#### OPTIONAL SLEEP DEBUG ####
# #
# Use to "exec" into the test k8s POD and run tests #
# manually to identify what dependencies are being used. #
# #
##########################################################
- name: "[DEBUG] sleep"
id: debug_sleep
if: ${{ inputs.debug == 'true' && inputs.sleep_time != '' }}
run: |
sleep ${{ inputs.sleep_time }}
############################
#### UPDATE SYSTEM PATH ####
############################
- name: "Update path: execute"
id: path_update_exec
run: |
# .local/bin is needed for certain libraries installed below to be recognized
# when calling their executable to install sub-dependencies
mkdir -p ${HOME}/.local/bin
echo "${HOME}/.local/bin" >> "$GITHUB_PATH"
#####################################
#### UPDATE CHECKPOINT DIRECTORY ####
#####################################
- name: "Update checkpoint directory"
id: checkpoint_update
run: |
echo "Checkpoint directory: ${MODEL_CHECKPOINT_DIR}/$LLAMA_3B_OVERRIDE"
if [ "${MODEL_ID}" = "llama_3b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" ]; then
echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_3B_OVERRIDE}" >> "$GITHUB_ENV"
elif [ "${MODEL_ID}" = "llama_8b" ] && [ -d "${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" ]; then
echo "MODEL_CHECKPOINT_DIR=${MODEL_CHECKPOINT_DIR}/${LLAMA_8B_OVERRIDE}" >> "$GITHUB_ENV"
else
echo "MODEL_ID & LLAMA_*B_OVERRIDE are not a valid pairing. Terminating workflow."
exit 1
fi
- name: "[DEBUG] Checkpoint update check"
id: debug_checkpoint_update
if: ${{ inputs.debug == 'true' }}
run: |
echo "MODEL_CHECKPOINT_DIR (after update) = ${MODEL_CHECKPOINT_DIR}"
##################################
#### DEPENDENCY INSTALLATIONS ####
##################################
- name: "Installing 'apt' required packages"
id: install_apt
run: |
echo "[STEP] Installing 'apt' required packages"
sudo apt update -y
sudo apt install -y python3 python3-pip npm wget
- name: "Installing packages with 'curl'"
id: install_curl
run: |
curl -fsSL https://ollama.com/install.sh | sh
- name: "Installing packages with 'wget'"
id: install_wget
run: |
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
chmod +x Miniconda3-latest-Linux-x86_64.sh
./Miniconda3-latest-Linux-x86_64.sh -b install -c pytorch -c nvidia faiss-gpu=1.9.0
# Add miniconda3 bin to system path
echo "${HOME}/miniconda3/bin" >> "$GITHUB_PATH"
- name: "Installing packages with 'npm'"
id: install_npm_generic
run: |
sudo npm install -g junit-merge
- name: "Installing pip dependencies"
id: install_pip_generic
run: |
echo "[STEP] Installing 'llama-stack' models"
pip install -U pip setuptools
pip install -r requirements.txt
pip install -e .
pip install -U \
torch torchvision \
pytest pytest_asyncio \
fairscale lm-format-enforcer \
zmq chardet pypdf \
pandas sentence_transformers together \
aiosqlite
- name: "Installing packages with conda"
id: install_conda_generic
run: |
conda install -q -c pytorch -c nvidia faiss-gpu=1.9.0
#############################################################
#### TESTING TO BE DONE FOR BOTH PRS AND MANUAL DISPATCH ####
#############################################################
- name: "Run Tests: Loop"
id: run_tests_loop
working-directory: "${{ github.workspace }}"
run: |
pattern=""
for dir in llama_stack/providers/tests/*; do
if [ -d "$dir" ]; then
dir_name=$(basename "$dir")
if [[ ! " $EXCLUDED_DIRS " =~ " $dir_name " ]]; then
for file in "$dir"/test_*.py; do
test_name=$(basename "$file")
new_file="result-${dir_name}-${test_name}.xml"
if torchrun $(which pytest) -s -v ${TESTS_PATH}/${dir_name}/${test_name} -m "${PROVIDER_ID} and ${MODEL_ID}" \
--junitxml="${{ github.workspace }}/${new_file}"; then
echo "Ran test: ${test_name}"
else
echo "Did NOT run test: ${test_name}"
fi
pattern+="${new_file} "
done
fi
fi
done
echo "REPORTS_GEN=$pattern" >> "$GITHUB_ENV"
- name: "Test Summary: Merge"
id: test_summary_merge
working-directory: "${{ github.workspace }}"
run: |
echo "Merging the following test result files: ${REPORTS_GEN}"
# Defaults to merging them into 'merged-test-results.xml'
junit-merge ${{ env.REPORTS_GEN }}
############################################
#### AUTOMATIC TESTING ON PULL REQUESTS ####
############################################
#### Run tests ####
- name: "PR - Run Tests"
id: pr_run_tests
working-directory: "${{ github.workspace }}"
if: github.event_name == 'pull_request_target'
run: |
echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${GITHUB_WORKSPACE} | path: ${{ github.workspace }}"
# (Optional) Add more tests here.
# Merge test results with 'merged-test-results.xml' from above.
# junit-merge <new-test-results> merged-test-results.xml
#### Create test summary ####
- name: "PR - Test Summary"
id: pr_test_summary_create
if: github.event_name == 'pull_request_target'
uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
with:
paths: "${{ github.workspace }}/merged-test-results.xml"
output: test-summary.md
- name: "PR - Upload Test Summary"
id: pr_test_summary_upload
if: github.event_name == 'pull_request_target'
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: test-summary
path: test-summary.md
#### Update PR request ####
- name: "PR - Update comment"
id: pr_update_comment
if: github.event_name == 'pull_request_target'
uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3.0.1
with:
filePath: test-summary.md
########################
#### MANUAL TESTING ####
########################
#### Run tests ####
- name: "Manual - Run Tests: Prep"
id: manual_run_tests
working-directory: "${{ github.workspace }}"
if: github.event_name == 'workflow_dispatch'
run: |
echo "[STEP] Running PyTest tests at 'GITHUB_WORKSPACE' path: ${{ github.workspace }}"
#TODO Use this when collection errors are resolved
# pytest -s -v -m "${PROVIDER_ID} and ${MODEL_ID}" --junitxml="${{ github.workspace }}/merged-test-results.xml"
# (Optional) Add more tests here.
# Merge test results with 'merged-test-results.xml' from above.
# junit-merge <new-test-results> merged-test-results.xml
#### Create test summary ####
- name: "Manual - Test Summary"
id: manual_test_summary
if: always() && github.event_name == 'workflow_dispatch'
uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
with:
paths: "${{ github.workspace }}/merged-test-results.xml"

View file

@ -1,26 +0,0 @@
name: Installer CI
on:
pull_request:
paths:
- 'install.sh'
push:
paths:
- 'install.sh'
schedule:
- cron: '0 2 * * *' # every day at 02:00 UTC
jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
- name: Run ShellCheck on install.sh
run: shellcheck install.sh
smoke-test:
needs: lint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
- name: Run installer end-to-end
run: ./install.sh

View file

@ -1,132 +0,0 @@
name: Integration Auth Tests
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'distributions/**'
- 'llama_stack/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/integration-auth-tests.yml' # This workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
test-matrix:
runs-on: ubuntu-latest
strategy:
matrix:
auth-provider: [oauth2_token]
fail-fast: false # we want to run all tests regardless of failure
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build Llama Stack
run: |
llama stack build --template ollama --image-type venv
- name: Install minikube
if: ${{ matrix.auth-provider == 'kubernetes' }}
uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19
- name: Start minikube
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
minikube start
kubectl get pods -A
- name: Configure Kube Auth
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
kubectl create namespace llama-stack
kubectl create serviceaccount llama-stack-auth -n llama-stack
kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack
kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
cat <<EOF | kubectl apply -f -
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: allow-anonymous-openid
rules:
- nonResourceURLs: ["/openid/v1/jwks"]
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: allow-anonymous-openid
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: allow-anonymous-openid
subjects:
- kind: User
name: system:anonymous
apiGroup: rbac.authorization.k8s.io
EOF
- name: Set Kubernetes Config
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
echo "KUBERNETES_API_SERVER_URL=$(kubectl get --raw /.well-known/openid-configuration| jq -r .jwks_uri)" >> $GITHUB_ENV
echo "KUBERNETES_CA_CERT_PATH=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}')" >> $GITHUB_ENV
echo "KUBERNETES_ISSUER=$(kubectl get --raw /.well-known/openid-configuration| jq -r .issuer)" >> $GITHUB_ENV
echo "KUBERNETES_AUDIENCE=$(kubectl create token llama-stack-auth -n llama-stack --duration=1h | cut -d. -f2 | base64 -d | jq -r '.aud[0]')" >> $GITHUB_ENV
- name: Set Kube Auth Config and run server
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
if: ${{ matrix.auth-provider == 'oauth2_token' }}
run: |
run_dir=$(mktemp -d)
cat <<'EOF' > $run_dir/run.yaml
version: '2'
image_name: kube
apis: []
providers: {}
server:
port: 8321
EOF
yq eval '.server.auth = {"provider_type": "${{ matrix.auth-provider }}"}' -i $run_dir/run.yaml
yq eval '.server.auth.config = {"tls_cafile": "${{ env.KUBERNETES_CA_CERT_PATH }}", "issuer": "${{ env.KUBERNETES_ISSUER }}", "audience": "${{ env.KUBERNETES_AUDIENCE }}"}' -i $run_dir/run.yaml
yq eval '.server.auth.config.jwks = {"uri": "${{ env.KUBERNETES_API_SERVER_URL }}"}' -i $run_dir/run.yaml
cat $run_dir/run.yaml
nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &
- name: Wait for Llama Stack server to be ready
run: |
echo "Waiting for Llama Stack server..."
for i in {1..30}; do
if curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://localhost:8321/v1/health | grep -q "OK"; then
echo "Llama Stack server is up!"
if grep -q "Enabling authentication with provider: ${{ matrix.auth-provider }}" server.log; then
echo "Llama Stack server is configured to use ${{ matrix.auth-provider }} auth"
exit 0
else
echo "Llama Stack server is not configured to use ${{ matrix.auth-provider }} auth"
cat server.log
exit 1
fi
fi
sleep 1
done
echo "Llama Stack server failed to start"
cat server.log
exit 1
- name: Test auth
run: |
curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers|jq

View file

@ -1,116 +0,0 @@
name: Integration Tests
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'llama_stack/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/integration-tests.yml' # This workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
test-matrix:
runs-on: ubuntu-latest
strategy:
matrix:
# Listing tests manually since some of them currently fail
# TODO: generate matrix list from tests/integration when fixed
test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime]
client-type: [library, http]
fail-fast: false # we want to run all tests regardless of failure
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Setup ollama
uses: ./.github/actions/setup-ollama
- name: Build Llama Stack
run: |
llama stack build --template ollama --image-type venv
- name: Start Llama Stack server in background
if: matrix.client-type == 'http'
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv &
- name: Wait for Llama Stack server to be ready
if: matrix.client-type == 'http'
run: |
echo "Waiting for Llama Stack server..."
for i in {1..30}; do
if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
echo "Llama Stack server is up!"
exit 0
fi
sleep 1
done
echo "Llama Stack server failed to start"
cat server.log
exit 1
- name: Verify Ollama status is OK
if: matrix.client-type == 'http'
run: |
echo "Verifying Ollama status..."
ollama_status=$(curl -s -L http://127.0.0.1:8321/v1/providers/ollama|jq --raw-output .health.status)
echo "Ollama status: $ollama_status"
if [ "$ollama_status" != "OK" ]; then
echo "Ollama health check failed"
exit 1
fi
- name: Check Storage and Memory Available Before Tests
if: ${{ always() }}
run: |
free -h
df -h
- name: Run Integration Tests
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
if [ "${{ matrix.client-type }}" == "library" ]; then
stack_config="ollama"
else
stack_config="http://localhost:8321"
fi
uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
-k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
--text-model="meta-llama/Llama-3.2-3B-Instruct" \
--embedding-model=all-MiniLM-L6-v2
- name: Check Storage and Memory Available After Tests
if: ${{ always() }}
run: |
free -h
df -h
- name: Write ollama logs to file
if: ${{ always() }}
run: |
sudo journalctl -u ollama.service > ollama.log
- name: Upload all logs to artifacts
if: ${{ always() }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}
path: |
*.log
retention-days: 1

View file

@ -1,45 +0,0 @@
name: Pre-commit
on:
pull_request:
push:
branches: [main]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.11'
cache: pip
cache-dependency-path: |
**/requirements*.txt
.pre-commit-config.yaml
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
env:
SKIP: no-commit-to-branch
RUFF_OUTPUT_FORMAT: github
- name: Verify if there are any diff files after pre-commit
run: |
git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
- name: Verify if there are any new files after pre-commit
run: |
unstaged_files=$(git ls-files --others --exclude-standard)
if [ -n "$unstaged_files" ]; then
echo "There are uncommitted new files, run pre-commit locally and commit again"
echo "$unstaged_files"
exit 1
fi

View file

@ -1,147 +0,0 @@
name: Test Llama Stack Build
on:
push:
branches:
- main
paths:
- 'llama_stack/cli/stack/build.py'
- 'llama_stack/cli/stack/_build.py'
- 'llama_stack/distribution/build.*'
- 'llama_stack/distribution/*.sh'
- '.github/workflows/providers-build.yml'
pull_request:
paths:
- 'llama_stack/cli/stack/build.py'
- 'llama_stack/cli/stack/_build.py'
- 'llama_stack/distribution/build.*'
- 'llama_stack/distribution/*.sh'
- '.github/workflows/providers-build.yml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
generate-matrix:
runs-on: ubuntu-latest
outputs:
templates: ${{ steps.set-matrix.outputs.templates }}
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Generate Template List
id: set-matrix
run: |
templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
echo "templates=$templates" >> "$GITHUB_OUTPUT"
build:
needs: generate-matrix
runs-on: ubuntu-latest
strategy:
matrix:
template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
image-type: [venv, container]
fail-fast: false # We want to run all jobs even if some fail
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Print build dependencies
run: |
uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only
- name: Run Llama Stack Build
run: |
# USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
# LLAMA_STACK_DIR is set to the current directory so we are building from the source
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test
- name: Print dependencies in the image
if: matrix.image-type == 'venv'
run: |
uv pip list
build-single-provider:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build a single provider
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama
build-custom-container-distribution:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build a single provider
run: |
yq -i '.image_type = "container"' llama_stack/templates/starter/build.yaml
yq -i '.image_name = "test"' llama_stack/templates/starter/build.yaml
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/starter/build.yaml
- name: Inspect the container image entrypoint
run: |
IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi
build-ubi9-container-distribution:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Pin template to UBI9 base
run: |
yq -i '
.image_type = "container" |
.image_name = "ubi9-test" |
.distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
' llama_stack/templates/starter/build.yaml
- name: Build dev container (UBI9)
env:
USE_COPY_NOT_MOUNT: "true"
LLAMA_STACK_DIR: "."
run: |
uv run llama stack build --config llama_stack/templates/starter/build.yaml
- name: Inspect UBI9 image
run: |
IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
echo "Entrypoint: $entrypoint"
if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
echo "Entrypoint is not correct"
exit 1
fi
echo "Checking /etc/os-release in $IMAGE_ID"
docker run --rm --entrypoint sh "$IMAGE_ID" -c \
'source /etc/os-release && echo "$ID"' \
| grep -qE '^(rhel|ubi)$' \
|| { echo "Base image is not UBI 9!"; exit 1; }

View file

@ -1,25 +0,0 @@
name: Check semantic PR titles
on:
pull_request_target:
types:
- opened
- edited
- reopened
- synchronize
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
permissions:
contents: read
jobs:
title-check:
runs-on: ubuntu-latest
steps:
- name: Check PR Title's semantic conformance
uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View file

@ -1,45 +0,0 @@
name: Close stale issues and PRs
on:
schedule:
- cron: '0 0 * * *' # every day at midnight
env:
LC_ALL: en_US.UTF-8
defaults:
run:
shell: bash
permissions:
contents: read
jobs:
stale:
permissions:
issues: write
pull-requests: write
runs-on: ubuntu-latest
steps:
- name: Stale Action
uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
with:
stale-issue-label: 'stale'
stale-issue-message: >
This issue has been automatically marked as stale because it has not had activity within 60 days.
It will be automatically closed if no further activity occurs within 30 days.
close-issue-message: >
This issue has been automatically closed due to inactivity.
Please feel free to reopen if you feel it is still relevant!
days-before-issue-stale: 60
days-before-issue-close: 30
stale-pr-label: 'stale'
stale-pr-message: >
This pull request has been automatically marked as stale because it has not had activity within 60 days.
It will be automatically closed if no further activity occurs within 30 days.
close-pr-message: >
This pull request has been automatically closed due to inactivity.
Please feel free to reopen if you intend to continue working on it!
days-before-pr-stale: 60
days-before-pr-close: 30
operations-per-run: 300

View file

@ -1,71 +0,0 @@
name: Test External Providers
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'llama_stack/**'
- 'tests/integration/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/test-external-providers.yml' # This workflow
jobs:
test-external-providers:
runs-on: ubuntu-latest
strategy:
matrix:
image-type: [venv]
# We don't do container yet, it's tricky to install a package from the host into the
# container and point 'uv pip install' to the correct path...
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Apply image type to config file
run: |
yq -i '.image_type = "${{ matrix.image-type }}"' tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
cat tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
- name: Setup directory for Ollama custom provider
run: |
mkdir -p tests/external-provider/llama-stack-provider-ollama/src/
cp -a llama_stack/providers/remote/inference/ollama/ tests/external-provider/llama-stack-provider-ollama/src/llama_stack_provider_ollama
- name: Create provider configuration
run: |
mkdir -p /home/runner/.llama/providers.d/remote/inference
cp tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml
- name: Build distro from config file
run: |
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
- name: Start Llama Stack server in background
if: ${{ matrix.image-type }} == 'venv'
env:
INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
run: |
uv run pip list
nohup uv run --active llama stack run tests/external-provider/llama-stack-provider-ollama/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &
- name: Wait for Llama Stack server to be ready
run: |
for i in {1..30}; do
if ! grep -q "remote::custom_ollama from /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml" server.log; then
echo "Waiting for Llama Stack server to load the provider..."
sleep 1
else
echo "Provider loaded"
exit 0
fi
done
echo "Provider failed to load"
cat server.log
exit 1

View file

@ -1,69 +0,0 @@
name: auto-tests
on:
# pull_request:
workflow_dispatch:
inputs:
commit_sha:
description: 'Specific Commit SHA to trigger on'
required: false
default: $GITHUB_SHA # default to the last commit of $GITHUB_REF branch
jobs:
test-llama-stack-as-library:
runs-on: ubuntu-latest
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
strategy:
matrix:
provider: [fireworks, together]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
ref: ${{ github.event.inputs.commit_sha }}
- name: Echo commit SHA
run: |
echo "Triggered on commit SHA: ${{ github.event.inputs.commit_sha }}"
git rev-parse HEAD
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt pytest
pip install -e .
- name: Build providers
run: |
llama stack build --template ${{ matrix.provider }} --image-type venv
- name: Install the latest llama-stack-client & llama-models packages
run: |
pip install -e git+https://github.com/meta-llama/llama-stack-client-python.git#egg=llama-stack-client
pip install -e git+https://github.com/meta-llama/llama-models.git#egg=llama-models
- name: Run client-sdk test
working-directory: "${{ github.workspace }}"
env:
REPORT_OUTPUT: md_report.md
shell: bash
run: |
pip install --upgrade pytest-md-report
echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV"
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT"
- name: Output reports to the job summary
if: always()
shell: bash
run: |
if [ -f "$REPORT_FILE" ]; then
echo "<details><summary> Test Report for ${{ matrix.provider }} </summary>" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
cat "$REPORT_FILE" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "</details>" >> $GITHUB_STEP_SUMMARY
fi

View file

@ -1,52 +0,0 @@
name: Unit Tests
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
paths:
- 'llama_stack/**'
- 'tests/unit/**'
- 'uv.lock'
- 'pyproject.toml'
- 'requirements.txt'
- '.github/workflows/unit-tests.yml' # This workflow
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
unit-tests:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python:
- "3.10"
- "3.11"
- "3.12"
- "3.13"
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Run unit tests
run: |
PYTHON_VERSION=${{ matrix.python }} ./scripts/unit-tests.sh --cov=llama_stack --junitxml=pytest-report-${{ matrix.python }}.xml --cov-report=html:htmlcov-${{ matrix.python }}
- name: Upload test results
if: always()
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: test-results-${{ matrix.python }}
path: |
.pytest_cache/
pytest-report-${{ matrix.python }}.xml
htmlcov-${{ matrix.python }}/
retention-days: 7

View file

@ -1,68 +0,0 @@
name: Update ReadTheDocs
on:
workflow_dispatch:
inputs:
branch:
description: 'RTD version to update'
required: false
default: 'latest'
push:
branches:
- main
paths:
- 'docs/**'
- 'pyproject.toml'
- '.github/workflows/update-readthedocs.yml'
tags:
- '*'
pull_request:
branches:
- main
paths:
- 'docs/**'
- 'pyproject.toml'
- '.github/workflows/update-readthedocs.yml'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
update-readthedocs:
runs-on: ubuntu-latest
env:
TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install dependencies
uses: ./.github/actions/setup-runner
- name: Build HTML
run: |
cd docs
uv run make html
- name: Trigger ReadTheDocs build
if: github.event_name != 'pull_request'
run: |
if [ -z "$TOKEN" ]; then
echo "READTHEDOCS_TOKEN is not set"
exit 1
fi
response=$(curl -X POST \
-H "Content-Type: application/json" \
-d "{
\"token\": \"$TOKEN\",
\"version\": \"$GITHUB_REF_NAME\"
}" \
https://readthedocs.org/api/v2/webhook/llama-stack/289768/)
echo "Response: $response"
if [ $(echo $response | jq -r '.build_triggered') != 'true' ]; then
echo "Failed to trigger ReadTheDocs build"
exit 1
fi

7
.gitignore vendored
View file

@ -6,7 +6,6 @@ dev_requirements.txt
build
.DS_Store
llama_stack/configs/*
.cursor/
xcuserdata/
*.hmap
.DS_Store
@ -19,9 +18,3 @@ Package.resolved
.vscode
_build
docs/src
pyrightconfig.json
venv/
pytest-report.xml
.coverage
.python-version
data

3
.gitmodules vendored Normal file
View file

@ -0,0 +1,3 @@
[submodule "llama_stack/providers/impls/ios/inference/executorch"]
path = llama_stack/providers/inline/ios/inference/executorch
url = https://github.com/pytorch/executorch

View file

@ -5,28 +5,19 @@ default_language_version:
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0 # Latest stable version
rev: 6306a48f7dae5861702d573c9c247e4e9498e867
hooks:
- id: check-merge-conflict
args: ['--assume-in-merge']
- id: trailing-whitespace
exclude: '\.py$' # Exclude Python files as Ruff already handles them
- id: check-ast
- id: check-merge-conflict
- id: check-added-large-files
args: ['--maxkb=1000']
- id: end-of-file-fixer
exclude: '^(.*\.svg)$'
- id: no-commit-to-branch
- id: check-yaml
args: ["--unsafe"]
- id: detect-private-key
- id: requirements-txt-fixer
- id: mixed-line-ending
args: [--fix=lf] # Forces to replace line ending by LF (line feed)
- id: check-executables-have-shebangs
- id: check-json
- id: check-shebang-scripts-are-executable
- id: check-symlinks
- id: check-toml
# Temporarily disabling this
# - id: no-commit-to-branch
# args: ['--branch=main']
- repo: https://github.com/Lucas-C/pre-commit-hooks
rev: v1.5.4
@ -37,46 +28,29 @@ repos:
- --license-filepath
- docs/license_header.txt
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.4
- repo: https://github.com/pycqa/flake8
rev: 34cbf8ef3950f43d09b85e2e45c15ae5717dc37b
hooks:
- id: ruff
args: [ --fix ]
exclude: ^llama_stack/strong_typing/.*$
- id: ruff-format
- repo: https://github.com/adamchainz/blacken-docs
rev: 1.19.0
hooks:
- id: blacken-docs
- id: flake8
additional_dependencies:
- black==24.3.0
- flake8-bugbear == 22.4.25
- pep8-naming == 0.12.1
- torchfix
args: ['--config=.flake8']
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.7.8
- repo: https://github.com/omnilib/ufmt
rev: v2.7.0
hooks:
- id: uv-lock
- id: uv-export
args: [
"--frozen",
"--no-hashes",
"--no-emit-project",
"--no-default-groups",
"--output-file=requirements.txt"
]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.15.0
hooks:
- id: mypy
- id: ufmt
additional_dependencies:
- uv==0.6.2
- mypy
- pytest
- rich
- types-requests
- pydantic
pass_filenames: false
- black == 24.4.2
- usort == 1.0.8
# - repo: https://github.com/jsh9/pydoclint
# rev: d88180a8632bb1602a4d81344085cf320f288c5a
# hooks:
# - id: pydoclint
# args: [--config=pyproject.toml]
# - repo: https://github.com/tcort/markdown-link-check
# rev: v3.11.2
@ -84,35 +58,16 @@ repos:
# - id: markdown-link-check
# args: ['--quiet']
- repo: local
hooks:
- id: distro-codegen
name: Distribution Template Codegen
additional_dependencies:
- uv==0.7.8
entry: uv run --group codegen ./scripts/distro_codegen.py
language: python
pass_filenames: false
require_serial: true
files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$
- id: openapi-codegen
name: API Spec Codegen
additional_dependencies:
- uv==0.7.8
entry: sh -c 'uv run ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
language: python
pass_filenames: false
require_serial: true
files: ^llama_stack/apis/|^docs/openapi_generator/
- id: check-workflows-use-hashes
name: Check GitHub Actions use SHA-pinned actions
entry: ./scripts/check-workflows-use-hashes.sh
language: system
pass_filenames: false
require_serial: true
always_run: true
files: ^\.github/workflows/.*\.ya?ml$
ci:
autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate
# - repo: local
# hooks:
# - id: distro-codegen
# name: Distribution Template Codegen
# additional_dependencies:
# - rich
# - pydantic
# entry: python -m llama_stack.scripts.distro_codegen
# language: python
# pass_filenames: false
# require_serial: true
# files: ^llama_stack/templates/.*$
# stages: [manual]

View file

@ -5,21 +5,28 @@
# Required
version: 2
# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: docs/source/conf.py
# Set the OS, Python version and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.12"
jobs:
pre_create_environment:
- asdf plugin add uv
- asdf install uv latest
- asdf global uv latest
create_environment:
- uv venv "${READTHEDOCS_VIRTUALENV_PATH}"
# You can also specify other tool versions:
# nodejs: "19"
# rust: "1.64"
# golang: "1.19"
# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: docs/source/conf.py
# Optionally build your docs in additional formats such as PDF and ePub
# formats:
# - pdf
# - epub
# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- UV_PROJECT_ENVIRONMENT="${READTHEDOCS_VIRTUALENV_PATH}" uv sync --frozen --group docs
- requirements: docs/requirements.txt

View file

@ -1,450 +1,6 @@
# Changelog
# v0.2.7
Published on: 2025-05-16T20:38:10Z
## Highlights
This is a small update. But a couple highlights:
* feat: function tools in OpenAI Responses by @bbrowning in https://github.com/meta-llama/llama-stack/pull/2094, getting closer to ready. Streaming is the next missing piece.
* feat: Adding support for customizing chunk context in RAG insertion and querying by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2134
* feat: scaffolding for Llama Stack UI by @ehhuang in https://github.com/meta-llama/llama-stack/pull/2149, more to come in the coming releases.
---
# v0.2.6
Published on: 2025-05-12T18:06:52Z
---
# v0.2.5
Published on: 2025-05-04T20:16:49Z
---
# v0.2.4
Published on: 2025-04-29T17:26:01Z
## Highlights
* One-liner to install and run Llama Stack yay! by @reluctantfuturist in https://github.com/meta-llama/llama-stack/pull/1383
* support for NVIDIA NeMo datastore by @raspawar in https://github.com/meta-llama/llama-stack/pull/1852
* (yuge!) Kubernetes authentication by @leseb in https://github.com/meta-llama/llama-stack/pull/1778
* (yuge!) OpenAI Responses API by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1989
* add api.llama provider, llama-guard-4 model by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2058
---
# v0.2.3
Published on: 2025-04-25T22:46:21Z
## Highlights
* OpenAI compatible inference endpoints and client-SDK support. `client.chat.completions.create()` now works.
* significant improvements and functionality added to the nVIDIA distribution
* many improvements to the test verification suite.
* new inference providers: Ramalama, IBM WatsonX
* many improvements to the Playground UI
---
# v0.2.2
Published on: 2025-04-13T01:19:49Z
## Main changes
- Bring Your Own Provider (@leseb) - use out-of-tree provider code to execute the distribution server
- OpenAI compatible inference API in progress (@bbrowning)
- Provider verifications (@ehhuang)
- Many updates and fixes to playground
- Several llama4 related fixes
---
# v0.2.1
Published on: 2025-04-05T23:13:00Z
---
# v0.2.0
Published on: 2025-04-05T19:04:29Z
## Llama 4 Support
Checkout more at https://www.llama.com
---
# v0.1.9
Published on: 2025-03-29T00:52:23Z
### Build and Test Agents
* Agents: Entire document context with attachments
* RAG: Documentation with sqlite-vec faiss comparison
* Getting started: Fixes to getting started notebook.
### Agent Evals and Model Customization
* (**New**) Post-training: Add nemo customizer
### Better Engineering
* Moved sqlite-vec to non-blocking calls
* Don't return a payload on file delete
---
# v0.1.8
Published on: 2025-03-24T01:28:50Z
# v0.1.8 Release Notes
### Build and Test Agents
* Safety: Integrated NVIDIA as a safety provider.
* VectorDB: Added Qdrant as an inline provider.
* Agents: Added support for multiple tool groups in agents.
* Agents: Simplified imports for Agents in client package
### Agent Evals and Model Customization
* Introduced DocVQA and IfEval benchmarks.
### Deploying and Monitoring Agents
* Introduced a Containerfile and image workflow for the Playground.
* Implemented support for Bearer (API Key) authentication.
* Added attribute-based access control for resources.
* Fixes on docker deployments: use --pull always and standardized the default port to 8321
* Deprecated: /v1/inspect/providers use /v1/providers/ instead
### Better Engineering
* Consolidated scripts under the ./scripts directory.
* Addressed mypy violations in various modules.
* Added Dependabot scans for Python dependencies.
* Implemented a scheduled workflow to update the changelog automatically.
* Enforced concurrency to reduce CI loads.
### New Contributors
* @cmodi-meta made their first contribution in https://github.com/meta-llama/llama-stack/pull/1650
* @jeffmaury made their first contribution in https://github.com/meta-llama/llama-stack/pull/1671
* @derekhiggins made their first contribution in https://github.com/meta-llama/llama-stack/pull/1698
* @Bobbins228 made their first contribution in https://github.com/meta-llama/llama-stack/pull/1745
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.7...v0.1.8
---
# v0.1.7
Published on: 2025-03-14T22:30:51Z
## 0.1.7 Release Notes
### Build and Test Agents
* Inference: ImageType is now refactored to LlamaStackImageType
* Inference: Added tests to measure TTFT
* Inference: Bring back usage metrics
* Agents: Added endpoint for get agent, list agents and list sessions
* Agents: Automated conversion of type hints in client tool for lite llm format
* Agents: Deprecated ToolResponseMessage in agent.resume API
* Added Provider API for listing and inspecting provider info
### Agent Evals and Model Customization
* Eval: Added new eval benchmarks Math 500 and BFCL v3
* Deploy and Monitoring of Agents
* Telemetry: Fix tracing to work across coroutines
### Better Engineering
* Display code coverage for unit tests
* Updated call sites (inference, tool calls, agents) to move to async non blocking calls
* Unit tests also run on Python 3.11, 3.12, and 3.13
* Added ollama inference to Integration tests CI
* Improved documentation across examples, testing, CLI, updated providers table )
---
# v0.1.6
Published on: 2025-03-08T04:35:08Z
## 0.1.6 Release Notes
### Build and Test Agents
* Inference: Fixed support for inline vllm provider
* (**New**) Agent: Build & Monitor Agent Workflows with Llama Stack + Anthropic's Best Practice [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Agent_Workflows.ipynb)
* (**New**) Agent: Revamped agent [documentation](https://llama-stack.readthedocs.io/en/latest/building_applications/agent.html) with more details and examples
* Agent: Unify tools and Python SDK Agents API
* Agent: AsyncAgent Python SDK wrapper supporting async client tool calls
* Agent: Support python functions without @client_tool decorator as client tools
* Agent: deprecation for allow_resume_turn flag, and remove need to specify tool_prompt_format
* VectorIO: MilvusDB support added
### Agent Evals and Model Customization
* (**New**) Agent: Llama Stack RAG Lifecycle [Notebook](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_RAG_Lifecycle.ipynb)
* Eval: Documentation for eval, scoring, adding new benchmarks
* Eval: Distribution template to run benchmarks on llama & non-llama models
* Eval: Ability to register new custom LLM-as-judge scoring functions
* (**New**) Looking for contributors for open benchmarks. See [documentation](https://llama-stack.readthedocs.io/en/latest/references/evals_reference/index.html#open-benchmark-contributing-guide) for details.
### Deploy and Monitoring of Agents
* Better support for different log levels across all components for better monitoring
### Better Engineering
* Enhance OpenAPI spec to include Error types across all APIs
* Moved all tests to /tests and created unit tests to run on each PR
* Removed all dependencies on llama-models repo
---
# v0.1.5.1
Published on: 2025-02-28T22:37:44Z
## 0.1.5.1 Release Notes
* Fixes for security risk in https://github.com/meta-llama/llama-stack/pull/1327 and https://github.com/meta-llama/llama-stack/pull/1328
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.1.5...v0.1.5.1
---
# v0.1.5
Published on: 2025-02-28T18:14:01Z
## 0.1.5 Release Notes
### Build Agents
* Inference: Support more non-llama models (openai, anthropic, gemini)
* Inference: Can use the provider's model name in addition to the HF alias
* Inference: Fixed issues with calling tools that weren't specified in the prompt
* RAG: Improved system prompt for RAG and no more need for hard-coded rag-tool calling
* Embeddings: Added support for Nemo retriever embedding models
* Tools: Added support for MCP tools in Ollama Distribution
* Distributions: Added new Groq distribution
### Customize Models
* Save post-trained checkpoint in SafeTensor format to allow Ollama inference provider to use the post-trained model
### Monitor agents
* More comprehensive logging of agent steps including client tools
* Telemetry inputs/outputs are now structured and queryable
* Ability to retrieve agents session, turn, step by ids
### Better Engineering
* Moved executorch Swift code out of this repo into the llama-stack-client-swift repo, similar to kotlin
* Move most logging to use logger instead of prints
* Completed text /chat-completion and /completion tests
---
# v0.1.4
Published on: 2025-02-25T00:02:43Z
## v0.1.4 Release Notes
Here are the key changes coming as part of this release:
### Build and Test Agents
* Inference: Added support for non-llama models
* Inference: Added option to list all downloaded models and remove models
* Agent: Introduce new api agents.resume_turn to include client side tool execution in the same turn
* Agent: AgentConfig introduces new variable “tool_config” that allows for better tool configuration and system prompt overrides
* Agent: Added logging for agent step start and completion times
* Agent: Added support for logging for tool execution metadata
* Embedding: Updated /inference/embeddings to support asymmetric models, truncation and variable sized outputs
* Embedding: Updated embedding models for Ollama, Together, and Fireworks with available defaults
* VectorIO: Improved performance of sqlite-vec using chunked writes
### Agent Evals and Model Customization
* Deprecated api /eval-tasks. Use /eval/benchmark instead
* Added CPU training support for TorchTune
### Deploy and Monitoring of Agents
* Consistent view of client and server tool calls in telemetry
### Better Engineering
* Made tests more data-driven for consistent evaluation
* Fixed documentation links and improved API reference generation
* Various small fixes for build scripts and system reliability
---
# v0.1.3
Published on: 2025-02-14T20:24:32Z
## v0.1.3 Release
Here are some key changes that are coming as part of this release.
### Build and Test Agents
Streamlined the initial development experience
- Added support for llama stack run --image-type venv
- Enhanced vector store options with new sqlite-vec provider and improved Qdrant integration
- vLLM improvements for tool calling and logprobs
- Better handling of sporadic code_interpreter tool calls
### Agent Evals
Better benchmarking and Agent performance assessment
- Renamed eval API /eval-task to /benchmarks
- Improved documentation and notebooks for RAG and evals
### Deploy and Monitoring of Agents
Improved production readiness
- Added usage metrics collection for chat completions
- CLI improvements for provider information
- Improved error handling and system reliability
- Better model endpoint handling and accessibility
- Improved signal handling on distro server
### Better Engineering
Infrastructure and code quality improvements
- Faster text-based chat completion tests
- Improved testing for non-streaming agent apis
- Standardized import formatting with ruff linter
- Added conventional commits standard
- Fixed documentation parsing issues
---
# v0.1.2
Published on: 2025-02-07T22:06:49Z
# TL;DR
- Several stabilizations to development flows after the switch to `uv`
- Migrated CI workflows to new OSS repo - [llama-stack-ops](https://github.com/meta-llama/llama-stack-ops)
- Added automated rebuilds for ReadTheDocs
- Llama Stack server supports HTTPS
- Added system prompt overrides support
- Several bug fixes and improvements to documentation (check out Kubernetes deployment guide by @terrytangyuan )
---
# v0.1.1
Published on: 2025-02-02T02:29:24Z
A bunch of small / big improvements everywhere including support for Windows, switching to `uv` and many provider improvements.
---
# v0.1.0
Published on: 2025-01-24T17:47:47Z
We are excited to announce a stable API release of Llama Stack, which enables developers to build RAG applications and Agents using tools and safety shields, monitor and those agents with telemetry, and evaluate the agent with scoring functions.
## Context
GenAI application developers need more than just an LLM - they need to integrate tools, connect with their data sources, establish guardrails, and ground the LLM responses effectively. Currently, developers must piece together various tools and APIs, complicating the development lifecycle and increasing costs. The result is that developers are spending more time on these integrations rather than focusing on the application logic itself. The bespoke coupling of components also makes it challenging to adopt state-of-the-art solutions in the rapidly evolving GenAI space. This is particularly difficult for open models like Llama, as best practices are not widely established in the open.
Llama Stack was created to provide developers with a comprehensive and coherent interface that simplifies AI application development and codifies best practices across the Llama ecosystem. Since our launch in September 2024, we have seen a huge uptick in interest in Llama Stack APIs by both AI developers and from partners building AI services with Llama models. Partners like Nvidia, Fireworks, and Ollama have collaborated with us to develop implementations across various APIs, including inference, memory, and safety.
With Llama Stack, you can easily build a RAG agent which can also search the web, do complex math, and custom tool calling. You can use telemetry to inspect those traces, and convert telemetry into evals datasets. And with Llama Stacks plugin architecture and prepackage distributions, you choose to run your agent anywhere - in the cloud with our partners, deploy your own environment using virtualenv, conda, or Docker, operate locally with Ollama, or even run on mobile devices with our SDKs. Llama Stack offers unprecedented flexibility while also simplifying the developer experience.
## Release
After iterating on the APIs for the last 3 months, today were launching a stable release (V1) of the Llama Stack APIs and the corresponding llama-stack server and client packages(v0.1.0). We now have automated tests for providers. These tests make sure that all provider implementations are verified. Developers can now easily and reliably select distributions or providers based on their specific requirements.
There are example standalone apps in llama-stack-apps.
## Key Features of this release
- **Unified API Layer**
- Inference: Run LLM models
- RAG: Store and retrieve knowledge for RAG
- Agents: Build multi-step agentic workflows
- Tools: Register tools that can be called by the agent
- Safety: Apply content filtering and safety policies
- Evaluation: Test model and agent quality
- Telemetry: Collect and analyze usage data and complex agentic traces
- Post Training ( Coming Soon ): Fine tune models for specific use cases
- **Rich Provider Ecosystem**
- Local Development: Meta's Reference, Ollama
- Cloud: Fireworks, Together, Nvidia, AWS Bedrock, Groq, Cerebras
- On-premises: Nvidia NIM, vLLM, TGI, Dell-TGI
- On-device: iOS and Android support
- **Built for Production**
- Pre-packaged distributions for common deployment scenarios
- Backwards compatibility across model versions
- Comprehensive evaluation capabilities
- Full observability and monitoring
- **Multiple developer interfaces**
- CLI: Command line interface
- Python SDK
- Swift iOS SDK
- Kotlin Android SDK
- **Sample llama stack applications**
- Python
- iOS
- Android
---
# v0.1.0rc12
Published on: 2025-01-22T22:24:01Z
---
# v0.0.63
Published on: 2024-12-18T07:17:43Z
A small but important bug-fix release to update the URL datatype for the client-SDKs. The issue affected multimodal agentic turns especially.
**Full Changelog**: https://github.com/meta-llama/llama-stack/compare/v0.0.62...v0.0.63
---
# v0.0.62
Published on: 2024-12-18T02:39:43Z
---
# v0.0.61
Published on: 2024-12-10T20:50:33Z
---
# v0.0.55
Published on: 2024-11-23T17:14:07Z
---
# v0.0.54
Published on: 2024-11-22T00:36:09Z
---
# v0.0.53
Published on: 2024-11-20T22:18:00Z
🚀 Initial Release Notes for Llama Stack!
## 0.0.53
### Added
- Resource-oriented design for models, shields, memory banks, datasets and eval tasks
@ -477,6 +33,3 @@ Published on: 2024-11-20T22:18:00Z
### Removed
- `llama stack configure` command
---

View file

@ -2,45 +2,47 @@
We want to make contributing to this project as easy and transparent as
possible.
## Discussions -> Issues -> Pull Requests
We actively welcome your pull requests. However, please read the following. This is heavily inspired by [Ghostty](https://github.com/ghostty-org/ghostty/blob/main/CONTRIBUTING.md).
If in doubt, please open a [discussion](https://github.com/meta-llama/llama-stack/discussions); we can always convert that to an issue later.
**I'd like to contribute!**
All issues are actionable (please report if they are not.) Pick one and start working on it. Thank you.
If you need help or guidance, comment on the issue. Issues that are extra friendly to new contributors are tagged with "contributor friendly".
**I have a bug!**
1. Search the issue tracker and discussions for similar issues.
2. If you don't have steps to reproduce, open a discussion.
3. If you have steps to reproduce, open an issue.
**I have an idea for a feature!**
1. Open a discussion.
**I've implemented a feature!**
1. If there is an issue for the feature, open a pull request.
2. If there is no issue, open a discussion and link to your branch.
**I have a question!**
1. Open a discussion or use [Discord](https://discord.gg/llama-stack).
**Opening a Pull Request**
## Pull Requests
We actively welcome your pull requests.
1. Fork the repo and create your branch from `main`.
2. If you've changed APIs, update the documentation.
3. Ensure the test suite passes.
4. Make sure your code lints using `pre-commit`.
5. If you haven't already, complete the Contributor License Agreement ("CLA").
6. Ensure your pull request follows the [conventional commits format](https://www.conventionalcommits.org/en/v1.0.0/).
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").
### Updating Provider Configurations
If you have made changes to a provider's configuration in any form (introducing a new config key, or changing models, etc.), you should run `python llama_stack/scripts/distro_codegen.py` to re-generate various YAML files as well as the documentation. You should not change `docs/source/.../distributions/` files manually as they are auto-generated.
### Building the Documentation
If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.
```bash
cd llama-stack/docs
pip install -r requirements.txt
pip install sphinx-autobuild
# This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
make html
sphinx-autobuild source build/html
```
## Pre-commit Hooks
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
```bash
$ cd llama-stack
$ conda activate <your-environment>
$ pip install pre-commit
$ pre-commit install
```
After that, pre-commit hooks will run automatically before each commit.
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
@ -56,133 +58,13 @@ Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Set up your development environment
We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
You can install the dependencies by running:
```bash
cd llama-stack
uv sync --extra dev
uv pip install -e .
source .venv/bin/activate
```
> [!NOTE]
> You can pin a specific version of Python to use for `uv` by adding a `.python-version` file in the root project directory.
> Otherwise, `uv` will automatically select a Python version according to the `requires-python` section of the `pyproject.toml`.
> For more info, see the [uv docs around Python versions](https://docs.astral.sh/uv/concepts/python-versions/).
Note that you can create a dotenv file `.env` that includes necessary environment variables:
```
LLAMA_STACK_BASE_URL=http://localhost:8321
LLAMA_STACK_CLIENT_LOG=debug
LLAMA_STACK_PORT=8321
LLAMA_STACK_CONFIG=<provider-name>
TAVILY_SEARCH_API_KEY=
BRAVE_SEARCH_API_KEY=
```
And then use this dotenv file when running client SDK tests via the following:
```bash
uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
```
## Pre-commit Hooks
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
```bash
uv run pre-commit install
```
After that, pre-commit hooks will run automatically before each commit.
Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
```bash
uv run pre-commit run --all-files
```
> [!CAUTION]
> Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
## Running tests
You can find the Llama Stack testing documentation here [here](tests/README.md).
## Adding a new dependency to the project
To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
```bash
uv add foo
uv sync
```
## Coding Style
* 2 spaces for indentation rather than tabs
* 80 character line length
* ...
* Comments should provide meaningful insights into the code. Avoid filler comments that simply
describe the next step, as they create unnecessary clutter, same goes for docstrings.
* Prefer comments to clarify surprising behavior and/or relationships between parts of the code
rather than explain what the next line of code does.
* Catching exceptions, prefer using a specific exception type rather than a broad catch-all like
`Exception`.
* Error messages should be prefixed with "Failed to ..."
* 4 spaces for indentation rather than tab
* When using `# noqa` to suppress a style or linter warning, include a comment explaining the
justification for bypassing the check.
* When using `# type: ignore` to suppress a mypy warning, include a comment explaining the
justification for bypassing the check.
* Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or
readability reasons.
## Common Tasks
Some tips about common tasks you work on while contributing to Llama Stack:
### Using `llama stack build`
Building a stack image (conda / docker) will use the production version of the `llama-stack` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_STACK_CLIENT_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
Example:
```bash
cd work/
git clone https://github.com/meta-llama/llama-stack.git
git clone https://github.com/meta-llama/llama-stack-client-python.git
cd llama-stack
LLAMA_STACK_DIR=$(pwd) LLAMA_STACK_CLIENT_DIR=../llama-stack-client-python llama stack build --template <...>
```
### Updating Provider Configurations
If you have made changes to a provider's configuration in any form (introducing a new config key, or changing models, etc.), you should run `./scripts/distro_codegen.py` to re-generate various YAML files as well as the documentation. You should not change `docs/source/.../distributions/` files manually as they are auto-generated.
### Building the Documentation
If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.
```bash
# This rebuilds the documentation pages.
uv run --group docs make -C docs/ html
# This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
```
### Update API Documentation
If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command:
```bash
uv run ./docs/openapi_generator/run_openapi_generator.sh
```
The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.
## Tips
* If you are developing with a llama-stack repository checked out and need your distribution to reflect changes from there, set `LLAMA_STACK_DIR` to that dir when running any of the `llama` CLI commands.
## License
By contributing to Llama, you agree that your contributions will be licensed

View file

@ -1,9 +1,5 @@
include pyproject.toml
include llama_stack/models/llama/llama3/tokenizer.model
include llama_stack/models/llama/llama4/tokenizer.model
include requirements.txt
include distributions/dependencies.json
include llama_stack/distribution/*.sh
include llama_stack/cli/scripts/*.sh
include llama_stack/templates/*/*.yaml
include llama_stack/providers/tests/test_cases/inference/*.json
include llama_stack/models/llama/*/*.md
include llama_stack/tests/integration/*.jpg

214
README.md
View file

@ -1,177 +1,121 @@
<img src="https://github.com/user-attachments/assets/33d9576d-95ea-468d-95e2-8fa233205a50" width="480" title="Llama Stack" alt="Llama Stack"/>
# Llama Stack
[![PyPI version](https://img.shields.io/pypi/v/llama_stack.svg)](https://pypi.org/project/llama_stack/)
[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/)
[![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)
[![Discord](https://img.shields.io/discord/1257833999603335178?color=6A7EC2&logo=discord&logoColor=ffffff)](https://discord.gg/llama-stack)
[![Unit Tests](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
[![Integration Tests](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml/badge.svg?branch=main)](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)
[![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack)
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Zero2Hero Guide**](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide)
### ✨🎉 Llama 4 Support 🎉✨
We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.
This repository contains the Llama Stack API specifications as well as API Providers and Llama Stack Distributions.
<details>
The Llama Stack defines and standardizes the building blocks needed to bring generative AI applications to market. These blocks span the entire development lifecycle: from model training and fine-tuning, through product evaluation, to building and running AI agents in production. Beyond definition, we are building providers for the Llama Stack APIs. These were developing open-source versions and partnering with providers, ensuring developers can assemble AI solutions using consistent, interlocking pieces across platforms. The ultimate goal is to accelerate innovation in the AI space.
<summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>
\
*Note you need 8xH100 GPU-host to run these models*
```bash
pip install -U llama_stack
MODEL="Llama-4-Scout-17B-16E-Instruct"
# get meta url from llama.com
llama model download --source meta --model-id $MODEL --meta-url <META_URL>
# start a llama stack server
INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu
# install client to interact with the server
pip install llama-stack-client
```
### CLI
```bash
# Run a chat completion
llama-stack-client --endpoint http://localhost:8321 \
inference chat-completion \
--model-id meta-llama/$MODEL \
--message "write a haiku for meta's llama 4 models"
ChatCompletionResponse(
completion_message=CompletionMessage(content="Whispers in code born\nLlama's gentle, wise heartbeat\nFuture's soft unfold", role='assistant', stop_reason='end_of_turn', tool_calls=[]),
logprobs=None,
metrics=[Metric(metric='prompt_tokens', value=21.0, unit=None), Metric(metric='completion_tokens', value=28.0, unit=None), Metric(metric='total_tokens', value=49.0, unit=None)]
)
```
### Python SDK
```python
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url=f"http://localhost:8321")
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
prompt = "Write a haiku about coding"
print(f"User> {prompt}")
response = client.inference.chat_completion(
model_id=model_id,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt},
],
)
print(f"Assistant> {response.completion_message.content}")
```
As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!
The Stack APIs are rapidly improving, but still very much work in progress and we invite feedback as well as direct contributions.
</details>
## APIs
### 🚀 One-Line Installer 🚀
The Llama Stack consists of the following set of APIs:
To try Llama Stack locally, run:
- Inference
- Safety
- Memory
- Agentic System
- Evaluation
- Post Training
- Synthetic Data Generation
- Reward Scoring
```bash
curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/install.sh | sh
```
Each of the APIs themselves is a collection of REST endpoints.
### Overview
Llama Stack standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides
## API Providers
- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
- **Plugin architecture** to support the rich ecosystem of different API implementations in various environments, including local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment.
- **Multiple developer interfaces** like CLI and SDKs for Python, Typescript, iOS, and Android.
- **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack.
A Provider is what makes the API real -- they provide the actual implementation backing the API.
<div style="text-align: center;">
<img
src="https://github.com/user-attachments/assets/33d9576d-95ea-468d-95e2-8fa233205a50"
width="480"
title="Llama Stack"
alt="Llama Stack"
/>
</div>
As an example, for Inference, we could have the implementation be backed by open source libraries like `[ torch | vLLM | TensorRT ]` as possible options.
### Llama Stack Benefits
- **Flexible Options**: Developers can choose their preferred infrastructure without changing APIs and enjoy flexible deployment choices.
- **Consistent Experience**: With its unified APIs, Llama Stack makes it easier to build, test, and deploy AI applications with consistent application behavior.
- **Robust Ecosystem**: Llama Stack is already integrated with distribution partners (cloud providers, hardware vendors, and AI-focused companies) that offer tailored infrastructure, software, and services for deploying Llama models.
A provider can also be just a pointer to a remote REST service -- for example, cloud providers or dedicated inference providers could serve these APIs.
By reducing friction and complexity, Llama Stack empowers developers to focus on what they do best: building transformative generative AI applications.
## Llama Stack Distribution
A Distribution is where APIs and Providers are assembled together to provide a consistent whole to the end application developer. You can mix-and-match providers -- some could be backed by local code and some could be remote. As a hobbyist, you can serve a small model locally, but can choose a cloud provider for a large model. Regardless, the higher level APIs your app needs to work with don't need to change at all. You can even imagine moving across the server / mobile-device boundary as well always using the same uniform set of APIs for developing Generative AI applications.
## Supported Llama Stack Implementations
### API Providers
Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | **Post Training** |
|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|:-----------------:|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | |
| SambaNova | Hosted | | ✅ | | ✅ | | |
| Cerebras | Hosted | | ✅ | | | | |
| Fireworks | Hosted | ✅ | ✅ | ✅ | | | |
| AWS Bedrock | Hosted | | ✅ | | ✅ | | |
| Together | Hosted | ✅ | ✅ | | ✅ | | |
| Groq | Hosted | | ✅ | | | | |
| Ollama | Single Node | | ✅ | | | | |
| TGI | Hosted and Single Node | | ✅ | | | | |
| NVIDIA NIM | Hosted and Single Node | | ✅ | | | | |
| Chroma | Single Node | | | ✅ | | | |
| PG Vector | Single Node | | | ✅ | | | |
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | |
| vLLM | Hosted and Single Node | | ✅ | | | | |
| OpenAI | Hosted | | ✅ | | | | |
| Anthropic | Hosted | | ✅ | | | | |
| Gemini | Hosted | | ✅ | | | | |
| watsonx | Hosted | | ✅ | | | | |
| HuggingFace | Single Node | | | | | | ✅ |
| TorchTune | Single Node | | | | | | ✅ |
| NVIDIA NEMO | Hosted | | | | | | ✅ |
| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
| :----: | :----: | :----: | :----: | :----: | :----: | :----: |
| Meta Reference | Single Node | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| Fireworks | Hosted | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | |
| AWS Bedrock | Hosted | | :heavy_check_mark: | | :heavy_check_mark: | |
| Together | Hosted | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | |
| Ollama | Single Node | | :heavy_check_mark: | | |
| TGI | Hosted and Single Node | | :heavy_check_mark: | | |
| Chroma | Single Node | | | :heavy_check_mark: | | |
| PG Vector | Single Node | | | :heavy_check_mark: | | |
| PyTorch ExecuTorch | On-device iOS | :heavy_check_mark: | :heavy_check_mark: | | |
### Distributions
A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider implementations for each API component. Distributions make it easy to get started with a specific deployment scenario - you can begin with a local development setup (eg. ollama) and seamlessly transition to production (eg. Fireworks) without changing your application code. Here are some of the distributions we support:
| **Distribution** | **Llama Stack Docker** | Start This Distribution |
|:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) |
| SambaNova | [llamastack/distribution-sambanova](https://hub.docker.com/repository/docker/llamastack/distribution-sambanova/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/sambanova.html) |
| Cerebras | [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/cerebras.html) |
| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html) |
| TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/tgi.html) |
| Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/together.html) |
| Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/fireworks.html) |
| vLLM | [llamastack/distribution-remote-vllm](https://hub.docker.com/repository/docker/llamastack/distribution-remote-vllm/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) |
|:----------------: |:------------------------------------------: |:-----------------------: |
| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) |
| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) |
| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) |
| TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) |
| Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/remote_hosted_distro/together.html) |
| Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/remote_hosted_distro/fireworks.html) |
## Installation
### Documentation
You have two ways to install this repository:
Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details.
1. **Install as a package**:
You can install the repository directly from [PyPI](https://pypi.org/project/llama-stack/) by running the following command:
```bash
pip install llama-stack
```
* CLI references
* [llama (server-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html): Guide for using the `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
* [llama (client-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_stack_client_cli_reference.html): Guide for using the `llama-stack-client` CLI, which allows you to query information about the distribution.
* Getting Started
* [Quick guide to start a Llama Stack server](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).
2. **Install from source**:
If you prefer to install from the source code, follow these steps:
```bash
mkdir -p ~/local
cd ~/local
git clone git@github.com:meta-llama/llama-stack.git
conda create -n stack python=3.10
conda activate stack
cd llama-stack
$CONDA_PREFIX/bin/pip install -e .
```
## Documentations
Please checkout our [Documentations](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details.
* [CLI reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html)
* Guide using `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
* [Getting Started](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html)
* Quick guide to start a Llama Stack server.
* [Jupyter notebook](./docs/getting_started.ipynb) to walk-through how to use simple text and vision inference llama_stack_client APIs
* The complete Llama Stack lesson [Colab notebook](https://colab.research.google.com/drive/1dtVmxotBsI4cGZQNsJRYPrLiDeT0Wnwt) of the new [Llama 3.2 course on Deeplearning.ai](https://learn.deeplearning.ai/courses/introducing-multimodal-llama-3-2/lesson/8/llama-stack).
* A [Zero-to-Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guide you through all the key components of llama stack with code samples.
* The [Zero2Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guide you through all the key components of llama stack with code samples.
* [Contributing](CONTRIBUTING.md)
* [Adding a new API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html) to walk-through how to add a new API provider.
* [Adding a new API Provider](https://llama-stack.readthedocs.io/en/latest/api_providers/new_api_provider.html) to walk-through how to add a new API provider.
### Llama Stack Client SDKs
## Llama Stack Client SDK
| **Language** | **Client SDK** | **Package** |
| :----: | :----: | :----: |
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
| Typescript | [llama-stack-client-typescript](https://github.com/meta-llama/llama-stack-client-typescript) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
Check out our client SDKs for connecting to a Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.
Check out our client SDKs for connecting to Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [node](https://github.com/meta-llama/llama-stack-client-node), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.
You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.

View file

@ -0,0 +1 @@
../../llama_stack/templates/bedrock/build.yaml

View file

@ -0,0 +1,15 @@
services:
llamastack:
image: distribution-bedrock
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/llamastack-run-bedrock.yaml
ports:
- "5000:5000"
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1 @@
../../llama_stack/templates/bedrock/run.yaml

View file

@ -0,0 +1,50 @@
services:
text-generation-inference:
image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
network_mode: "host"
volumes:
- $HOME/.cache/huggingface:/data
ports:
- "5009:5009"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0,1,2,3,4
- NUM_SHARD=4
- MAX_BATCH_PREFILL_TOKENS=32768
- MAX_INPUT_TOKENS=8000
- MAX_TOTAL_TOKENS=8192
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: all
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
runtime: nvidia
llamastack:
depends_on:
text-generation-inference:
condition: service_healthy
image: llamastack/distribution-tgi
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
# Link to TGI run.yaml file
- ./run.yaml:/root/my-run.yaml
ports:
- "5000:5000"
# Hack: wait for TGI server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1,44 @@
version: '2'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: tgi0
provider_type: remote::tgi
config:
url: http://127.0.0.1:80
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
- provider_id: meta1
provider_type: inline::prompt-guard
config:
model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: inline::faiss
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -0,0 +1,315 @@
{
"hf-serverless": [
"aiohttp",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"together": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"together",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"vllm-gpu": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"vllm",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"remote-vllm": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"openai",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"fireworks": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"tgi": [
"aiohttp",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"bedrock": [
"aiosqlite",
"blobfile",
"boto3",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"meta-reference-gpu": [
"accelerate",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"fairscale",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"lm-format-enforcer",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"torch",
"torchvision",
"tqdm",
"transformers",
"uvicorn",
"zmq",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"meta-reference-quantized-gpu": [
"accelerate",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"fairscale",
"faiss-cpu",
"fastapi",
"fbgemm-gpu",
"fire",
"httpx",
"lm-format-enforcer",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"torch",
"torchao==0.5.0",
"torchvision",
"tqdm",
"transformers",
"uvicorn",
"zmq",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"ollama": [
"aiohttp",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"ollama",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"hf-endpoint": [
"aiohttp",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
]
}

View file

@ -0,0 +1 @@
../../llama_stack/templates/fireworks/build.yaml

View file

@ -0,0 +1,16 @@
services:
llamastack:
image: llamastack/distribution-fireworks
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/llamastack-run-fireworks.yaml
ports:
- "5000:5000"
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-fireworks.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1 @@
../../llama_stack/templates/fireworks/run.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-gpu/build.yaml

View file

@ -0,0 +1,34 @@
services:
llamastack:
image: llamastack/distribution-meta-reference-gpu
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/my-run.yaml
ports:
- "5000:5000"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: 1
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s
runtime: nvidia
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"

View file

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-gpu/run.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml

View file

@ -0,0 +1,35 @@
services:
llamastack:
image: llamastack/distribution-meta-reference-quantized-gpu
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/my-run.yaml
ports:
- "5000:5000"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: 1
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
runtime: nvidia
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1,58 @@
version: '2'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: meta0
provider_type: inline::meta-reference-quantized
config:
model: Llama3.2-3B-Instruct:int4-qlora-eo8
quantization:
type: int4
torch_seed: null
max_seq_len: 2048
max_batch_size: 1
- provider_id: meta1
provider_type: inline::meta-reference-quantized
config:
# not a quantized model !
model: Llama-Guard-3-1B
quantization: null
torch_seed: null
max_seq_len: 2048
max_batch_size: 1
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
- provider_id: meta1
provider_type: inline::prompt-guard
config:
model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -0,0 +1 @@
../../llama_stack/templates/ollama/build.yaml

View file

@ -0,0 +1,71 @@
services:
ollama:
image: ollama/ollama:latest
network_mode: ${NETWORK_MODE:-bridge}
volumes:
- ~/.ollama:/root/.ollama
ports:
- "11434:11434"
environment:
OLLAMA_DEBUG: 1
command: []
deploy:
resources:
limits:
memory: 8G # Set maximum memory
reservations:
memory: 8G # Set minimum memory reservation
# healthcheck:
# # ugh, no CURL in ollama image
# test: ["CMD", "curl", "-f", "http://ollama:11434"]
# interval: 10s
# timeout: 5s
# retries: 5
ollama-init:
image: ollama/ollama:latest
depends_on:
- ollama
# condition: service_healthy
network_mode: ${NETWORK_MODE:-bridge}
environment:
- OLLAMA_HOST=ollama
- INFERENCE_MODEL=${INFERENCE_MODEL}
- SAFETY_MODEL=${SAFETY_MODEL:-}
volumes:
- ~/.ollama:/root/.ollama
- ./pull-models.sh:/pull-models.sh
entrypoint: ["/pull-models.sh"]
llamastack:
depends_on:
ollama:
condition: service_started
ollama-init:
condition: service_started
image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
network_mode: ${NETWORK_MODE:-bridge}
volumes:
- ~/.llama:/root/.llama
# Link to ollama run.yaml file
- ~/local/llama-stack/:/app/llama-stack-source
- ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
ports:
- "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
environment:
- INFERENCE_MODEL=${INFERENCE_MODEL}
- SAFETY_MODEL=${SAFETY_MODEL:-}
- OLLAMA_URL=http://ollama:11434
entrypoint: >
python -m llama_stack.distribution.server.server /root/my-run.yaml \
--port ${LLAMA_STACK_PORT:-5001}
deploy:
restart_policy:
condition: on-failure
delay: 10s
max_attempts: 3
window: 60s
volumes:
ollama:
ollama-init:
llamastack:

View file

@ -0,0 +1,18 @@
#!/bin/sh
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
echo "Preloading $model..."
if ! ollama run "$model"; then
echo "Failed to pull and run $model"
exit 1
fi
done
echo "All models pulled successfully"

View file

@ -0,0 +1 @@
../../llama_stack/templates/ollama/run-with-safety.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/ollama/run.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/remote-vllm/build.yaml

View file

@ -0,0 +1,100 @@
services:
vllm-inference:
image: vllm/vllm-openai:latest
volumes:
- $HOME/.cache/huggingface:/root/.cache/huggingface
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
- HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
command: >
--gpu-memory-utilization 0.75
--model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
--enforce-eager
--max-model-len 8192
--max-num-seqs 16
--port ${VLLM_INFERENCE_PORT:-5100}
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
interval: 30s
timeout: 10s
retries: 5
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
# A little trick:
# if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
# otherwise, the entry will end in a hyphen which gets ignored by docker compose
vllm-${VLLM_SAFETY_MODEL:+safety}:
image: vllm/vllm-openai:latest
volumes:
- $HOME/.cache/huggingface:/root/.cache/huggingface
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
- HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
command: >
--gpu-memory-utilization 0.75
--model ${VLLM_SAFETY_MODEL}
--enforce-eager
--max-model-len 8192
--max-num-seqs 16
--port ${VLLM_SAFETY_PORT:-5101}
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
interval: 30s
timeout: 10s
retries: 5
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
llamastack:
depends_on:
- vllm-inference:
condition: service_healthy
- vllm-${VLLM_SAFETY_MODEL:+safety}:
condition: service_healthy
# image: llamastack/distribution-remote-vllm
image: llamastack/distribution-remote-vllm:test-0.0.52rc3
volumes:
- ~/.llama:/root/.llama
- ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
network_mode: ${NETWORK_MODE:-bridged}
environment:
- VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
- VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
- INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
- MAX_TOKENS=${MAX_TOKENS:-4096}
- SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
- SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
ports:
- "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}"
# Hack: wait for vLLM server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s
volumes:
vllm-inference:
vllm-safety:
llamastack:

View file

@ -0,0 +1 @@
../../llama_stack/templates/remote-vllm/run-with-safety.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/remote-vllm/run.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/tgi/build.yaml

View file

@ -0,0 +1,103 @@
services:
tgi-inference:
image: ghcr.io/huggingface/text-generation-inference:latest
volumes:
- $HOME/.cache/huggingface:/data
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
- HF_TOKEN=$HF_TOKEN
- HF_HOME=/data
- HF_DATASETS_CACHE=/data
- HF_MODULES_CACHE=/data
- HF_HUB_CACHE=/data
command: >
--dtype bfloat16
--usage-stats off
--sharded false
--model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
--port ${TGI_INFERENCE_PORT:-8080}
--cuda-memory-fraction 0.75
healthcheck:
test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
interval: 5s
timeout: 5s
retries: 30
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
tgi-${TGI_SAFETY_MODEL:+safety}:
image: ghcr.io/huggingface/text-generation-inference:latest
volumes:
- $HOME/.cache/huggingface:/data
network_mode: ${NETWORK_MODE:-bridged}
ports:
- "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
- HF_TOKEN=$HF_TOKEN
- HF_HOME=/data
- HF_DATASETS_CACHE=/data
- HF_MODULES_CACHE=/data
- HF_HUB_CACHE=/data
command: >
--dtype bfloat16
--usage-stats off
--sharded false
--model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
--port ${TGI_SAFETY_PORT:-8081}
--cuda-memory-fraction 0.75
healthcheck:
test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
interval: 5s
timeout: 5s
retries: 30
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]
runtime: nvidia
llamastack:
depends_on:
tgi-inference:
condition: service_healthy
tgi-${TGI_SAFETY_MODEL:+safety}:
condition: service_healthy
image: llamastack/distribution-tgi:test-0.0.52rc3
network_mode: ${NETWORK_MODE:-bridged}
volumes:
- ~/.llama:/root/.llama
- ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
ports:
- "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
# Hack: wait for TGI server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s
environment:
- TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
- SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
- INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
- SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
volumes:
tgi-inference:
tgi-safety:
llamastack:

View file

@ -0,0 +1 @@
../../llama_stack/templates/tgi/run-with-safety.yaml

1
distributions/tgi/run.yaml Symbolic link
View file

@ -0,0 +1 @@
../../llama_stack/templates/tgi/run.yaml

View file

@ -0,0 +1,65 @@
# Together Distribution
### Connect to a Llama Stack Together Endpoint
- You may connect to a hosted endpoint `https://llama-stack.together.ai`, serving a Llama Stack distribution
The `llamastack/distribution-together` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | remote::together | meta-reference | meta-reference, remote::weaviate | meta-reference | meta-reference |
### Docker: Start the Distribution (Single Node CPU)
> [!NOTE]
> This assumes you have an hosted endpoint at Together with API Key.
```
$ cd distributions/together
$ ls
compose.yaml run.yaml
$ docker compose up
```
Make sure in you `run.yaml` file, you inference provider is pointing to the correct Together URL server endpoint. E.g.
```
inference:
- provider_id: together
provider_type: remote::together
config:
url: https://api.together.xyz/v1
api_key: <optional api key>
```
### Conda llama stack run (Single Node CPU)
```bash
llama stack build --template together --image-type conda
# -- modify run.yaml to a valid Together server endpoint
llama stack run ./run.yaml
```
### (Optional) Update Model Serving Configuration
Use `llama-stack-client models list` to check the available models served by together.
```
$ llama-stack-client models list
+------------------------------+------------------------------+---------------+------------+
| identifier | llama_model | provider_id | metadata |
+==============================+==============================+===============+============+
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
```

View file

@ -0,0 +1 @@
../../llama_stack/templates/together/build.yaml

View file

@ -0,0 +1,16 @@
services:
llamastack:
image: llamastack/distribution-together
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/llamastack-run-together.yaml
ports:
- "5000:5000"
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-together.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1 @@
../../llama_stack/templates/together/run.yaml

View file

@ -0,0 +1 @@
../../llama_stack/templates/inline-vllm/build.yaml

View file

@ -0,0 +1,35 @@
services:
llamastack:
image: llamastack/distribution-inline-vllm
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/my-run.yaml
ports:
- "5000:5000"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: 1
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
runtime: nvidia
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1,66 @@
version: '2'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: vllm-inference
provider_type: inline::vllm
config:
model: Llama3.2-3B-Instruct
tensor_parallel_size: 1
gpu_memory_utilization: 0.4
enforce_eager: true
max_tokens: 4096
- provider_id: vllm-inference-safety
provider_type: inline::vllm
config:
model: Llama-Guard-3-1B
tensor_parallel_size: 1
gpu_memory_utilization: 0.2
enforce_eager: true
max_tokens: 4096
safety:
- provider_id: meta0
provider_type: inline::llama-guard
config:
model: Llama-Guard-3-1B
excluded_categories: []
# Uncomment to use prompt guard
# - provider_id: meta1
# provider_type: inline::prompt-guard
# config:
# model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}
# Uncomment to use pgvector
# - provider_id: pgvector
# provider_type: remote::pgvector
# config:
# host: 127.0.0.1
# port: 5432
# db: postgres
# user: postgres
# password: mysecretpassword
agents:
- provider_id: meta0
provider_type: inline::meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/agents_store.db
telemetry:
- provider_id: meta0
provider_type: inline::meta-reference
config: {}

View file

@ -12,24 +12,3 @@
.wy-side-nav-search {
background-color: transparent !important;
}
.hide-title h1 {
display: none;
}
h2, h3, h4 {
font-weight: normal;
}
html[data-theme="dark"] .rst-content div[class^="highlight"] {
background-color: #0b0b0b;
}
pre {
white-space: pre-wrap !important;
word-break: break-all;
}
[data-theme="dark"] .mermaid {
background-color: #f4f4f6 !important;
border-radius: 6px;
padding: 0.5em;
}

View file

@ -1,32 +0,0 @@
document.addEventListener("DOMContentLoaded", function () {
const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
const htmlElement = document.documentElement;
// Check if theme is saved in localStorage
const savedTheme = localStorage.getItem("sphinx-rtd-theme");
if (savedTheme) {
// Use the saved theme preference
htmlElement.setAttribute("data-theme", savedTheme);
document.body.classList.toggle("dark", savedTheme === "dark");
} else {
// Fall back to system preference
const theme = prefersDark ? "dark" : "light";
htmlElement.setAttribute("data-theme", theme);
document.body.classList.toggle("dark", theme === "dark");
// Save initial preference
localStorage.setItem("sphinx-rtd-theme", theme);
}
// Listen for theme changes from the existing toggle
const observer = new MutationObserver(function(mutations) {
mutations.forEach(function(mutation) {
if (mutation.attributeName === "data-theme") {
const currentTheme = htmlElement.getAttribute("data-theme");
localStorage.setItem("sphinx-rtd-theme", currentTheme);
}
});
});
observer.observe(htmlElement, { attributes: true });
});

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

Before

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 56 KiB

View file

@ -1,24 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import os
import time
def pytest_collection_modifyitems(items):
for item in items:
item.name = item.name.replace(' ', '_')
def pytest_runtest_teardown(item):
interval_seconds = os.getenv("LLAMA_STACK_TEST_INTERVAL_SECONDS")
if interval_seconds:
time.sleep(float(interval_seconds))
def pytest_configure(config):
config.option.tbstyle = "short"
config.option.disable_warnings = True

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1 +1,9 @@
The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/distribution/server/endpoints.py` using the `generate.py` utility.
The RFC Specification (OpenAPI format) is generated from the set of API endpoints located in `llama_stack/[<subdir>]/api/endpoints.py` using the `generate.py` utility.
Please install the following packages before running the script:
```
pip install python-openapi json-strong-typing fire PyYAML llama-models
```
Then simply run `sh run_openapi_generator.sh <OUTPUT_DIR>`

View file

@ -12,46 +12,41 @@
from datetime import datetime
from pathlib import Path
import sys
import fire
import ruamel.yaml as yaml
import fire
import yaml
from llama_models import schema_utils
from .pyopenapi.options import Options
from .pyopenapi.specification import Info, Server
from .pyopenapi.utility import Specification
# We do some monkey-patching to ensure our definitions only use the minimal
# (json_schema_type, webmethod) definitions from the llama_models package. For
# generation though, we need the full definitions and implementations from the
# (json-strong-typing) package.
from .strong_typing.schema import json_schema_type
schema_utils.json_schema_type = json_schema_type
# this line needs to be here to ensure json_schema_type has been altered before
# the imports use the annotation
from llama_stack.apis.version import LLAMA_STACK_API_VERSION # noqa: E402
from llama_stack.distribution.stack import LlamaStack # noqa: E402
from .pyopenapi.options import Options # noqa: E402
from .pyopenapi.specification import Info, Server # noqa: E402
from .pyopenapi.utility import Specification, validate_api # noqa: E402
def str_presenter(dumper, data):
if data.startswith(f"/{LLAMA_STACK_API_VERSION}") or data.startswith(
"#/components/schemas/"
):
style = None
else:
style = ">" if "\n" in data or len(data) > 40 else None
return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)
def main(output_dir: str):
output_dir = Path(output_dir)
if not output_dir.exists():
raise ValueError(f"Directory {output_dir} does not exist")
# Validate API protocols before generating spec
return_type_errors = validate_api()
if return_type_errors:
print("\nAPI Method Return Type Validation Errors:\n")
for error in return_type_errors:
print(error, file=sys.stderr)
sys.exit(1)
now = str(datetime.now())
print(
"Converting the spec to YAML (openapi.yaml) and HTML (openapi.html) at " + now
)
print("")
spec = Specification(
LlamaStack,
Options(
@ -63,25 +58,11 @@ def main(output_dir: str):
a set of endpoints and their corresponding interfaces that are tailored to
best leverage Llama Models.""",
),
include_standard_error_responses=True,
),
)
with open(output_dir / "llama-stack-spec.yaml", "w", encoding="utf-8") as fp:
y = yaml.YAML()
y.default_flow_style = False
y.block_seq_indent = 2
y.map_indent = 2
y.sequence_indent = 4
y.sequence_dash_offset = 2
y.width = 80
y.allow_unicode = True
y.representer.add_representer(str, str_presenter)
y.dump(
spec.get_json(),
fp,
)
yaml.dump(spec.get_json(), fp, allow_unicode=True)
with open(output_dir / "llama-stack-spec.html", "w") as fp:
spec.write_html(fp, pretty_print=True)

View file

@ -4,17 +4,15 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import collections
import hashlib
import ipaddress
import types
import typing
from dataclasses import make_dataclass
from typing import Any, Dict, Set, Union
from llama_stack.apis.datatypes import Error
from llama_stack.strong_typing.core import JsonType
from llama_stack.strong_typing.docstring import Docstring, parse_type
from llama_stack.strong_typing.inspection import (
from ..strong_typing.core import JsonType
from ..strong_typing.docstring import Docstring, parse_type
from ..strong_typing.inspection import (
is_generic_list,
is_type_optional,
is_type_union,
@ -22,15 +20,15 @@ from llama_stack.strong_typing.inspection import (
unwrap_optional_type,
unwrap_union_types,
)
from llama_stack.strong_typing.name import python_type_to_name
from llama_stack.strong_typing.schema import (
from ..strong_typing.name import python_type_to_name
from ..strong_typing.schema import (
get_schema_identifier,
JsonSchemaGenerator,
register_schema,
Schema,
SchemaOptions,
)
from llama_stack.strong_typing.serialization import json_dump_string, object_to_json
from ..strong_typing.serialization import json_dump_string, object_to_json
from .operations import (
EndpointOperation,
@ -179,37 +177,20 @@ class ContentBuilder:
) -> Dict[str, MediaType]:
"Creates the content subtree for a request or response."
def is_iterator_type(t):
return "StreamChunk" in str(t) or "OpenAIResponseObjectStream" in str(t)
def get_media_type(t):
if is_generic_list(t):
return "application/jsonl"
elif is_iterator_type(t):
return "text/event-stream"
def has_iterator_type(t):
if typing.get_origin(t) is typing.Union:
return any(has_iterator_type(a) for a in typing.get_args(t))
else:
return "application/json"
if typing.get_origin(payload_type) in (typing.Union, types.UnionType):
media_types = []
item_types = []
for x in typing.get_args(payload_type):
media_types.append(get_media_type(x))
item_types.append(x)
if len(set(media_types)) == 1:
# all types have the same media type
return {media_types[0]: self.build_media_type(payload_type, examples)}
else:
# different types have different media types
return {
media_type: self.build_media_type(item_type, examples)
for media_type, item_type in zip(media_types, item_types)
}
# TODO: needs a proper fix where we let all types correctly flow upwards
# and then test against AsyncIterator
return "StreamChunk" in str(t)
if is_generic_list(payload_type):
media_type = "application/jsonl"
item_type = unwrap_generic_list(payload_type)
elif has_iterator_type(payload_type):
item_type = payload_type
media_type = "text/event-stream"
else:
media_type = "application/json"
item_type = payload_type
@ -252,9 +233,7 @@ class ContentBuilder:
value = sample_transformer(object_to_json(example))
hash_string = (
hashlib.sha256(json_dump_string(value).encode("utf-8"))
.digest()
.hex()[:16]
hashlib.md5(json_dump_string(value).encode("utf-8")).digest().hex()
)
name = f"ex-{hash_string}"
@ -297,20 +276,6 @@ class StatusResponse:
examples: List[Any] = dataclasses.field(default_factory=list)
def create_docstring_for_request(
request_name: str, fields: List[Tuple[str, type, Any]], doc_params: Dict[str, str]
) -> str:
"""Creates a ReST-style docstring for a dynamically generated request dataclass."""
lines = ["\n"] # Short description
# Add parameter documentation in ReST format
for name, type_ in fields:
desc = doc_params.get(name, "")
lines.append(f":param {name}: {desc}")
return "\n".join(lines)
class ResponseBuilder:
content_builder: ContentBuilder
@ -437,90 +402,19 @@ class Generator:
self.schema_builder = SchemaBuilder(schema_generator)
self.responses = {}
# Create standard error responses
self._create_standard_error_responses()
def _create_standard_error_responses(self) -> None:
"""
Creates standard error responses that can be reused across operations.
These will be added to the components.responses section of the OpenAPI document.
"""
# Get the Error schema
error_schema = self.schema_builder.classdef_to_ref(Error)
# Create standard error responses
self.responses["BadRequest400"] = Response(
description="The request was invalid or malformed",
content={
"application/json": MediaType(
schema=error_schema,
example={
"status": 400,
"title": "Bad Request",
"detail": "The request was invalid or malformed",
},
)
},
)
self.responses["TooManyRequests429"] = Response(
description="The client has sent too many requests in a given amount of time",
content={
"application/json": MediaType(
schema=error_schema,
example={
"status": 429,
"title": "Too Many Requests",
"detail": "You have exceeded the rate limit. Please try again later.",
},
)
},
)
self.responses["InternalServerError500"] = Response(
description="The server encountered an unexpected error",
content={
"application/json": MediaType(
schema=error_schema,
example={
"status": 500,
"title": "Internal Server Error",
"detail": "An unexpected error occurred. Our team has been notified.",
},
)
},
)
# Add a default error response for any unhandled error cases
self.responses["DefaultError"] = Response(
description="An unexpected error occurred",
content={
"application/json": MediaType(
schema=error_schema,
example={
"status": 0,
"title": "Error",
"detail": "An unexpected error occurred",
},
)
},
)
def _build_type_tag(self, ref: str, schema: Schema) -> Tag:
# Don't include schema definition in the tag description because for one,
# it is not very valuable and for another, it causes string formatting
# discrepancies via the Stainless Studio.
#
# definition = f'<SchemaDefinition schemaRef="#/components/schemas/{ref}" />'
definition = f'<SchemaDefinition schemaRef="#/components/schemas/{ref}" />'
title = typing.cast(str, schema.get("title"))
description = typing.cast(str, schema.get("description"))
return Tag(
name=ref,
description="\n\n".join(s for s in (title, description) if s is not None),
description="\n\n".join(
s for s in (title, description, definition) if s is not None
),
)
def _build_extra_tag_groups(
self, extra_types: Dict[str, Dict[str, type]]
self, extra_types: Dict[str, List[type]]
) -> Dict[str, List[Tag]]:
"""
Creates a dictionary of tag group captions as keys, and tag lists as values.
@ -533,8 +427,9 @@ class Generator:
for category_name, category_items in extra_types.items():
tag_list: List[Tag] = []
for name, extra_type in category_items.items():
schema = self.schema_builder.classdef_to_schema(extra_type)
for extra_type in category_items:
name = python_type_to_name(extra_type)
schema = self.schema_builder.classdef_to_named_schema(name, extra_type)
tag_list.append(self._build_type_tag(name, schema))
if tag_list:
@ -551,10 +446,6 @@ class Generator:
op.defining_class.__name__ = f"{op.defining_class.__name__} (Coming Soon)"
print(op.defining_class.__name__)
# TODO (xiyan): temporary fix for datasetio inner impl + datasets api
# if op.defining_class.__name__ in ["DatasetIO"]:
# op.defining_class.__name__ = "Datasets"
doc_string = parse_type(op.func_ref)
doc_params = dict(
(param.name, param.description) for param in doc_string.params.values()
@ -593,55 +484,27 @@ class Generator:
# parameters passed anywhere
parameters = path_parameters + query_parameters
webmethod = getattr(op.func_ref, "__webmethod__", None)
raw_bytes_request_body = False
if webmethod:
raw_bytes_request_body = getattr(webmethod, "raw_bytes_request_body", False)
# data passed in request body as raw bytes cannot have request parameters
if raw_bytes_request_body and op.request_params:
raise ValueError(
"Cannot have both raw bytes request body and request parameters"
parameters += [
Parameter(
name="X-LlamaStack-ProviderData",
in_=ParameterLocation.Header,
description="JSON-encoded provider data which will be made available to the adapter servicing the API",
required=False,
schema=self.schema_builder.classdef_to_ref(str),
)
]
# data passed in request body as raw bytes
if raw_bytes_request_body:
requestBody = RequestBody(
content={
"application/octet-stream": {
"schema": {
"type": "string",
"format": "binary",
}
}
},
required=True,
)
# data passed in payload as JSON and mapped to request parameters
elif op.request_params:
# data passed in payload
if op.request_params:
builder = ContentBuilder(self.schema_builder)
first = next(iter(op.request_params))
request_name, request_type = first
from dataclasses import make_dataclass
op_name = "".join(word.capitalize() for word in op.name.split("_"))
request_name = f"{op_name}Request"
fields = [
(
name,
type_,
)
for name, type_ in op.request_params
]
request_type = make_dataclass(
request_name,
fields,
namespace={
"__doc__": create_docstring_for_request(
request_name, fields, doc_params
)
},
)
request_type = make_dataclass(request_name, op.request_params)
requestBody = RequestBody(
content={
@ -665,6 +528,7 @@ class Generator:
success_type_descriptions = {
item: doc_string.short_description
for item, doc_string in success_type_docstring.items()
if doc_string.short_description
}
else:
# use return type as a single response type
@ -723,19 +587,6 @@ class Generator:
)
responses.update(response_builder.build_response(response_options))
assert len(responses.keys()) > 0, f"No responses found for {op.name}"
# Add standard error response references
if self.options.include_standard_error_responses:
if "400" not in responses:
responses["400"] = ResponseRef("BadRequest400")
if "429" not in responses:
responses["429"] = ResponseRef("TooManyRequests429")
if "500" not in responses:
responses["500"] = ResponseRef("InternalServerError500")
if "default" not in responses:
responses["default"] = ResponseRef("DefaultError")
if op.event_type is not None:
builder = ContentBuilder(self.schema_builder)
callbacks = {
@ -754,20 +605,14 @@ class Generator:
else:
callbacks = None
description = "\n".join(
filter(None, [doc_string.short_description, doc_string.long_description])
)
return Operation(
tags=[getattr(op.defining_class, "API_NAMESPACE", op.defining_class.__name__)],
summary=None,
# summary=doc_string.short_description,
description=description,
tags=[op.defining_class.__name__],
summary=doc_string.short_description,
description=doc_string.long_description,
parameters=parameters,
requestBody=requestBody,
responses=responses,
callbacks=callbacks,
deprecated=True if "DEPRECATED" in op.func_name else None,
security=[] if op.public else None,
)
@ -795,7 +640,6 @@ class Generator:
raise NotImplementedError(f"unknown HTTP method: {op.http_method}")
route = op.get_route()
route = route.replace(":path", "")
print(f"route: {route}")
if route in paths:
paths[route].update(pathItem)
@ -805,8 +649,6 @@ class Generator:
operation_tags: List[Tag] = []
for cls in endpoint_classes:
doc_string = parse_type(cls)
if hasattr(cls, "API_NAMESPACE") and cls.API_NAMESPACE != cls.__name__:
continue
operation_tags.append(
Tag(
name=cls.__name__,
@ -815,6 +657,12 @@ class Generator:
)
)
# types that are produced/consumed by operations
type_tags = [
self._build_type_tag(ref, schema)
for ref, schema in self.schema_builder.schemas.items()
]
# types that are emitted by events
event_tags: List[Tag] = []
events = get_endpoint_events(self.endpoint)
@ -841,6 +689,7 @@ class Generator:
# list all operations and types
tags: List[Tag] = []
tags.extend(operation_tags)
tags.extend(type_tags)
tags.extend(event_tags)
for extra_tag_group in extra_tag_groups.values():
tags.extend(extra_tag_group)
@ -855,6 +704,13 @@ class Generator:
tags=sorted(tag.name for tag in operation_tags),
)
)
if type_tags:
tag_groups.append(
TagGroup(
name=self.options.map("Types"),
tags=sorted(tag.name for tag in type_tags),
)
)
if event_tags:
tag_groups.append(
TagGroup(
@ -865,7 +721,7 @@ class Generator:
for caption, extra_tag_group in extra_tag_groups.items():
tag_groups.append(
TagGroup(
name=caption,
name=self.options.map(caption),
tags=sorted(tag.name for tag in extra_tag_group),
)
)

View file

@ -8,6 +8,7 @@ import collections.abc
import enum
import inspect
import typing
import uuid
from dataclasses import dataclass
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Union
@ -15,7 +16,12 @@ from llama_stack.apis.version import LLAMA_STACK_API_VERSION
from termcolor import colored
from llama_stack.strong_typing.inspection import get_signature
from ..strong_typing.inspection import (
get_signature,
is_type_enum,
is_type_optional,
unwrap_optional_type,
)
def split_prefix(
@ -107,6 +113,9 @@ class EndpointOperation:
def get_route(self) -> str:
if self.route is not None:
assert (
"_" not in self.route
), f"route should not contain underscores: {self.route}"
return "/".join(["", LLAMA_STACK_API_VERSION, self.route.lstrip("/")])
route_parts = ["", LLAMA_STACK_API_VERSION, self.name]
@ -130,8 +139,6 @@ class _FormatParameterExtractor:
def _get_route_parameters(route: str) -> List[str]:
extractor = _FormatParameterExtractor()
# Replace all occurrences of ":path" with empty string
route = route.replace(":path", "")
route.format_map(extractor)
return extractor.keys
@ -150,14 +157,7 @@ def _get_endpoint_functions(
print(f"Processing {colored(func_name, 'white')}...")
operation_name = func_name
if webmethod.method == "GET":
prefix = "get"
elif webmethod.method == "DELETE":
prefix = "delete"
elif webmethod.method == "POST":
prefix = "post"
elif operation_name.startswith("get_") or operation_name.endswith("/get"):
if operation_name.startswith("get_") or operation_name.endswith("/get"):
prefix = "get"
elif (
operation_name.startswith("delete_")
@ -166,6 +166,11 @@ def _get_endpoint_functions(
or operation_name.endswith("/remove")
):
prefix = "delete"
else:
if webmethod.method == "GET":
prefix = "get"
elif webmethod.method == "DELETE":
prefix = "delete"
else:
# by default everything else is a POST
prefix = "post"
@ -176,16 +181,10 @@ def _get_endpoint_functions(
def _get_defining_class(member_fn: str, derived_cls: type) -> type:
"Find the class in which a member function is first defined in a class inheritance hierarchy."
# This import must be dynamic here
from llama_stack.apis.tools import RAGToolRuntime, ToolRuntime
# iterate in reverse member resolution order to find most specific class first
for cls in reversed(inspect.getmro(derived_cls)):
for name, _ in inspect.getmembers(cls, inspect.isfunction):
if name == member_fn:
# HACK ALERT
if cls == RAGToolRuntime:
return ToolRuntime
return cls
raise ValidationError(
@ -266,15 +265,41 @@ def get_endpoint_operations(
f"parameter '{param_name}' in function '{func_name}' has no type annotation"
)
if prefix in ["get", "delete"]:
if route_params is not None and param_name in route_params:
if is_type_optional(param_type):
inner_type: type = unwrap_optional_type(param_type)
else:
inner_type = param_type
if prefix == "get" and (
inner_type is bool
or inner_type is int
or inner_type is float
or inner_type is str
or inner_type is uuid.UUID
or is_type_enum(inner_type)
):
if parameter.kind == inspect.Parameter.POSITIONAL_ONLY:
if route_params is not None and param_name not in route_params:
raise ValidationError(
f"positional parameter '{param_name}' absent from user-defined route '{route}' for function '{func_name}'"
)
# simple type maps to route path element, e.g. /study/{uuid}/{version}
path_params.append((param_name, param_type))
else:
if route_params is not None and param_name in route_params:
raise ValidationError(
f"query parameter '{param_name}' found in user-defined route '{route}' for function '{func_name}'"
)
# simple type maps to key=value pair in query string
query_params.append((param_name, param_type))
else:
if route_params is not None and param_name in route_params:
path_params.append((param_name, param_type))
else:
raise ValidationError(
f"user-defined route '{route}' for function '{func_name}' has parameter '{param_name}' of composite type: {param_type}"
)
request_params.append((param_name, param_type))
# check if function has explicit return type
@ -310,18 +335,19 @@ def get_endpoint_operations(
response_type = process_type(return_type)
# set HTTP request method based on type of request and presence of payload
if not request_params:
if prefix in ["delete", "remove"]:
http_method = HTTPMethod.DELETE
elif prefix == "post":
http_method = HTTPMethod.POST
elif prefix == "get":
else:
http_method = HTTPMethod.GET
elif prefix == "set":
else:
if prefix == "set":
http_method = HTTPMethod.PUT
elif prefix == "update":
http_method = HTTPMethod.PATCH
else:
raise ValidationError(f"unknown prefix {prefix}")
http_method = HTTPMethod.POST
result.append(
EndpointOperation(

View file

@ -35,7 +35,6 @@ class Options:
:param error_wrapper: True if errors are encapsulated in an error object wrapper.
:param property_description_fun: Custom transformation function to apply to class property documentation strings.
:param captions: User-defined captions for sections such as "Operations" or "Types", and (if applicable) groups of extra types.
:param include_standard_error_responses: Whether to include standard error responses (400, 429, 500, 503) in all operations.
"""
server: Server
@ -53,7 +52,6 @@ class Options:
error_wrapper: bool = False
property_description_fun: Optional[Callable[[type, str, str], str]] = None
captions: Optional[Dict[str, str]] = None
include_standard_error_responses: bool = True
default_captions: ClassVar[Dict[str, str]] = {
"Operations": "Operations",

View file

@ -9,7 +9,7 @@ import enum
from dataclasses import dataclass
from typing import Any, ClassVar, Dict, List, Optional, Union
from llama_stack.strong_typing.schema import JsonType, Schema, StrictJsonType
from ..strong_typing.schema import JsonType, Schema, StrictJsonType
URL = str
@ -78,7 +78,7 @@ class MediaType:
@dataclass
class RequestBody:
content: Dict[str, MediaType | Dict[str, Any]]
content: Dict[str, MediaType]
description: Optional[str] = None
required: Optional[bool] = None
@ -117,7 +117,6 @@ class Operation:
requestBody: Optional[RequestBody] = None
callbacks: Optional[Dict[str, "Callback"]] = None
security: Optional[List["SecurityRequirement"]] = None
deprecated: Optional[bool] = None
@dataclass

View file

@ -6,36 +6,36 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>OpenAPI specification</title>
<link href="https://fonts.googleapis.com/css?family=Montserrat:300,400,700|Roboto:300,400,700" rel="stylesheet">
<script type="module" src="https://cdn.jsdelivr.net/npm/@stoplight/elements/web-components.min.js"></script>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@stoplight/elements/styles.min.css">
<style>
body {
margin: 0;
padding: 0;
height: 100vh;
}
elements-api {
height: 100%;
}
</style>
</head>
<body>
<elements-api id="openapi-container" router="hash" layout="sidebar" hideExport="true"
hideInternal="true"></elements-api>
<script>
<script defer="defer" src="https://cdn.redoc.ly/redoc/latest/bundles/redoc.standalone.js"></script>
<script defer="defer">
document.addEventListener("DOMContentLoaded", function () {
const spec = { /* OPENAPI_SPECIFICATION */ };
const element = document.getElementById("openapi-container");
element.apiDescriptionDocument = spec;
spec = { /* OPENAPI_SPECIFICATION */ };
options = {
downloadFileName: "openapi.json",
expandResponses: "200",
expandSingleSchemaField: true,
jsonSampleExpandLevel: "all",
schemaExpansionLevel: "all",
};
element = document.getElementById("openapi-container");
Redoc.init(spec, options, element);
if (spec.info && spec.info.title) {
document.title = spec.info.title;
}
});
</script>
</head>
<body>
<div id="openapi-container"></div>
</body>
</html>

View file

@ -6,18 +6,16 @@
import json
import typing
import inspect
from pathlib import Path
from typing import TextIO
from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args
from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
from llama_stack.distribution.resolver import api_protocol_map
from ..strong_typing.schema import object_to_json, StrictJsonType
from .generator import Generator
from .options import Options
from .specification import Document
THIS_DIR = Path(__file__).parent
@ -116,147 +114,3 @@ class Specification:
)
f.write(html)
def is_optional_type(type_: Any) -> bool:
"""Check if a type is Optional."""
origin = get_origin(type_)
args = get_args(type_)
return origin is Optional or (origin is Union and type(None) in args)
def _validate_api_method_return_type(method) -> str | None:
hints = get_type_hints(method)
if 'return' not in hints:
return "has no return type annotation"
return_type = hints['return']
if is_optional_type(return_type):
return "returns Optional type where a return value is mandatory"
def _validate_api_method_doesnt_return_list(method) -> str | None:
hints = get_type_hints(method)
if 'return' not in hints:
return "has no return type annotation"
return_type = hints['return']
if get_origin(return_type) is list:
return "returns a list where a PaginatedResponse or List*Response object is expected"
def _validate_api_delete_method_returns_none(method) -> str | None:
hints = get_type_hints(method)
if 'return' not in hints:
return "has no return type annotation"
return_type = hints['return']
if return_type is not None and return_type is not type(None):
return "does not return None where None is mandatory"
def _validate_list_parameters_contain_data(method) -> str | None:
hints = get_type_hints(method)
if 'return' not in hints:
return "has no return type annotation"
return_type = hints['return']
if not inspect.isclass(return_type):
return
if not return_type.__name__.startswith('List'):
return
if 'data' not in return_type.model_fields:
return "does not have a mandatory data attribute containing the list of objects"
def _validate_has_ellipsis(method) -> str | None:
source = inspect.getsource(method)
if "..." not in source and not "NotImplementedError" in source:
return "does not contain ellipsis (...) in its implementation"
def _validate_has_return_in_docstring(method) -> str | None:
source = inspect.getsource(method)
return_type = method.__annotations__.get('return')
if return_type is not None and return_type != type(None) and ":returns:" not in source:
return "does not have a ':returns:' in its docstring"
def _validate_has_params_in_docstring(method) -> str | None:
source = inspect.getsource(method)
sig = inspect.signature(method)
# Only check if the method has more than one parameter
if len(sig.parameters) > 1 and ":param" not in source:
return "does not have a ':param' in its docstring"
def _validate_has_no_return_none_in_docstring(method) -> str | None:
source = inspect.getsource(method)
return_type = method.__annotations__.get('return')
if return_type is None and ":returns: None" in source:
return "has a ':returns: None' in its docstring which is redundant for None-returning functions"
def _validate_docstring_lines_end_with_dot(method) -> str | None:
docstring = inspect.getdoc(method)
if docstring is None:
return None
lines = docstring.split('\n')
for line in lines:
line = line.strip()
if line and not any(line.endswith(char) for char in '.:{}[]()",'):
return f"docstring line '{line}' does not end with a valid character: . : {{ }} [ ] ( ) , \""
_VALIDATORS = {
"GET": [
_validate_api_method_return_type,
_validate_list_parameters_contain_data,
_validate_api_method_doesnt_return_list,
_validate_has_ellipsis,
_validate_has_return_in_docstring,
_validate_has_params_in_docstring,
_validate_docstring_lines_end_with_dot,
],
"DELETE": [
_validate_api_delete_method_returns_none,
_validate_has_ellipsis,
_validate_has_return_in_docstring,
_validate_has_params_in_docstring,
_validate_has_no_return_none_in_docstring
],
"POST": [
_validate_has_ellipsis,
_validate_has_return_in_docstring,
_validate_has_params_in_docstring,
_validate_has_no_return_none_in_docstring,
_validate_docstring_lines_end_with_dot,
],
}
def _get_methods_by_type(protocol, method_type: str):
members = inspect.getmembers(protocol, predicate=inspect.isfunction)
return {
method_name: method
for method_name, method in members
if (webmethod := getattr(method, '__webmethod__', None))
if webmethod and webmethod.method == method_type
}
def validate_api() -> List[str]:
"""Validate the API protocols."""
errors = []
protocols = api_protocol_map()
for target, validators in _VALIDATORS.items():
for protocol_name, protocol in protocols.items():
for validator in validators:
for method_name, method in _get_methods_by_type(protocol, target).items():
err = validator(method)
if err:
errors.append(f"Method {protocol_name}.{method_name} {err}")
return errors

View file

@ -28,5 +28,5 @@ if [ ${#missing_packages[@]} -ne 0 ]; then
fi
stack_dir=$(dirname $(dirname $THIS_DIR))
PYTHONPATH=$PYTHONPATH:$stack_dir \
python -m docs.openapi_generator.generate $(dirname $THIS_DIR)/_static
models_dir=$(dirname $stack_dir)/llama-models
PYTHONPATH=$PYTHONPATH:$stack_dir:$models_dir python -m docs.openapi_generator.generate $(dirname $THIS_DIR)/resources

View file

@ -13,7 +13,7 @@ Type-safe data interchange for Python data classes.
import dataclasses
import sys
from dataclasses import is_dataclass
from typing import Callable, Dict, Optional, Type, TypeVar, Union, overload
from typing import Callable, Dict, Optional, overload, Type, TypeVar, Union
if sys.version_info >= (3, 9):
from typing import Annotated as Annotated
@ -42,7 +42,9 @@ def _compact_dataclass_repr(obj: object) -> str:
"""
if is_dataclass(obj):
arglist = ", ".join(repr(getattr(obj, field.name)) for field in dataclasses.fields(obj))
arglist = ", ".join(
repr(getattr(obj, field.name)) for field in dataclasses.fields(obj)
)
return f"{obj.__class__.__name__}({arglist})"
else:
return obj.__class__.__name__
@ -60,7 +62,9 @@ def typeannotation(cls: Type[T], /) -> Type[T]: ...
@overload
def typeannotation(cls: None, *, eq: bool = True, order: bool = False) -> Callable[[Type[T]], Type[T]]: ...
def typeannotation(
cls: None, *, eq: bool = True, order: bool = False
) -> Callable[[Type[T]], Type[T]]: ...
@dataclass_transform(eq_default=True, order_default=False)
@ -77,9 +81,7 @@ def typeannotation(
"""
def wrap(cls: Type[T]) -> Type[T]:
# mypy fails to equate bound-y functions (first argument interpreted as
# the bound object) with class methods, hence the `ignore` directive.
cls.__repr__ = _compact_dataclass_repr # type: ignore[method-assign]
setattr(cls, "__repr__", _compact_dataclass_repr)
if not dataclasses.is_dataclass(cls):
cls = dataclasses.dataclass( # type: ignore[call-overload]
cls,

View file

@ -22,13 +22,13 @@ from typing import Any, Dict, List, Literal, Optional, Tuple, Type, TypeVar, Uni
from .auxiliary import (
Alias,
Annotated,
MaxLength,
Precision,
float32,
float64,
int16,
int32,
int64,
MaxLength,
Precision,
)
from .core import JsonType, Schema
from .docstring import Docstring, DocstringParam
@ -122,16 +122,9 @@ class JsonSchemaAnyOf(JsonSchemaNode):
anyOf: List["JsonSchemaAny"]
@dataclass
class Discriminator:
propertyName: str
mapping: Dict[str, str]
@dataclass
class JsonSchemaOneOf(JsonSchemaNode):
oneOf: List["JsonSchemaAny"]
discriminator: Optional[Discriminator]
JsonSchemaAny = Union[
@ -181,13 +174,17 @@ def enum_values_to_type(
# assign the newly created type to the same module where the defining class is
enum_class.__module__ = module.__name__
enum_class.__doc__ = str(Docstring(short_description=title, long_description=description))
enum_class.__doc__ = str(
Docstring(short_description=title, long_description=description)
)
setattr(module, name, enum_class)
return enum.unique(enum_class)
def schema_to_type(schema: Schema, *, module: types.ModuleType, class_name: str) -> TypeLike:
def schema_to_type(
schema: Schema, *, module: types.ModuleType, class_name: str
) -> TypeLike:
"""
Creates a Python type from a JSON schema.
@ -196,14 +193,16 @@ def schema_to_type(schema: Schema, *, module: types.ModuleType, class_name: str)
:param class_name: The name assigned to the top-level class.
"""
top_node = typing.cast(JsonSchemaTopLevelObject, json_to_object(JsonSchemaTopLevelObject, schema))
top_node = typing.cast(
JsonSchemaTopLevelObject, json_to_object(JsonSchemaTopLevelObject, schema)
)
if top_node.definitions is not None:
for type_name, type_node in top_node.definitions.items():
type_def = node_to_typedef(module, type_name, type_node)
if type_def.default is not dataclasses.MISSING:
raise TypeError("disallowed: `default` for top-level type definitions")
type_def.type.__module__ = module.__name__
setattr(type_def.type, "__module__", module.__name__)
setattr(module, type_name, type_def.type)
return node_to_typedef(module, class_name, top_node).type
@ -222,7 +221,9 @@ def json_to_value(target_type: TypeLike, data: JsonType) -> Any:
return dataclasses.MISSING
def node_to_typedef(module: types.ModuleType, context: str, node: JsonSchemaNode) -> TypeDef:
def node_to_typedef(
module: types.ModuleType, context: str, node: JsonSchemaNode
) -> TypeDef:
if isinstance(node, JsonSchemaRef):
match_obj = re.match(r"^#/definitions/(\w+)$", node.ref)
if not match_obj:
@ -352,16 +353,22 @@ def node_to_typedef(module: types.ModuleType, context: str, node: JsonSchemaNode
prop_type = type_def.type
else:
prop_type = Union[(None, type_def.type)]
fields.append((prop_name, prop_type, dataclasses.field(default=type_def.default)))
fields.append(
(prop_name, prop_type, dataclasses.field(default=type_def.default))
)
prop_desc = prop_node.title or prop_node.description
if prop_desc is not None:
params[prop_name] = DocstringParam(prop_name, prop_desc)
fields.sort(key=lambda t: t[2].default is not dataclasses.MISSING)
if sys.version_info >= (3, 12):
class_type = dataclasses.make_dataclass(class_name, fields, module=module.__name__)
class_type = dataclasses.make_dataclass(
class_name, fields, module=module.__name__
)
else:
class_type = dataclasses.make_dataclass(class_name, fields, namespace={"__module__": module.__name__})
class_type = dataclasses.make_dataclass(
class_name, fields, namespace={"__module__": module.__name__}
)
class_type.__doc__ = str(
Docstring(
short_description=node.title,
@ -388,8 +395,12 @@ class SchemaFlatteningOptions:
recursive: bool = False
def flatten_schema(schema: Schema, *, options: Optional[SchemaFlatteningOptions] = None) -> Schema:
top_node = typing.cast(JsonSchemaTopLevelObject, json_to_object(JsonSchemaTopLevelObject, schema))
def flatten_schema(
schema: Schema, *, options: Optional[SchemaFlatteningOptions] = None
) -> Schema:
top_node = typing.cast(
JsonSchemaTopLevelObject, json_to_object(JsonSchemaTopLevelObject, schema)
)
flattener = SchemaFlattener(options)
obj = flattener.flatten(top_node)
return typing.cast(Schema, object_to_json(obj))
@ -424,7 +435,9 @@ class SchemaFlattener:
obj = prop
if obj.properties is not None:
if self.options.qualified_names:
target_props.update((f"{name}.{n}", p) for n, p in obj.properties.items())
target_props.update(
(f"{name}.{n}", p) for n, p in obj.properties.items()
)
else:
target_props.update(obj.properties.items())
if obj.required is not None:

Some files were not shown because too many files have changed in this diff Show more