Merge pull request #1 from meta-llama/main

Merging upstream changes
cdgamarose-nv 2025-02-13 11:16:22 -08:00 committed by GitHub
commit eb1c5e86fe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
389 changed files with 10041 additions and 7739 deletions

.github/CODEOWNERS

@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721
* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan


@ -1,6 +1,6 @@
name: 🐛 Bug Report
description: Create a report to help us reproduce and fix the bug
labels: ["bug"]
body:
- type: markdown
attributes:

.github/ISSUE_TEMPLATE/config.yml

@ -0,0 +1,12 @@
blank_issues_enabled: false
contact_links:
- name: Have you read the docs?
url: https://llama-stack.readthedocs.io/en/latest/index.html
about: Much help can be found in the docs
- name: Start a discussion
url: https://github.com/meta-llama/llama-stack/discussions/new
about: Start a discussion on a topic
- name: Chat on Discord
url: https://discord.gg/llama-stack
about: Maybe chatting with the community can help


@ -1,6 +1,6 @@
name: 🚀 Feature request
description: Request a new llama-stack feature
labels: ["enhancement"]
body:
- type: textarea
id: feature-pitch


@ -1,27 +1,10 @@
# What does this PR do?
[Provide a short summary of what this PR does and why. Link to relevant issues if applicable.]
In short, provide a summary of what this PR does and why. Usually, the relevant context should be present in a linked issue.
- [ ] Addresses issue (#issue)
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
## Test Plan
[Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*]
Please describe:
- tests you ran to verify your changes with result summaries.
- provide instructions so it can be reproduced.
## Sources
Please link relevant resources if necessary.
## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md),
Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.
[//]: # (## Documentation)


@ -11,10 +11,10 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: pip
@ -22,4 +22,8 @@ jobs:
**/requirements*.txt
.pre-commit-config.yaml
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd #v3.0.1
- uses: pre-commit/action@v3.0.1
- name: Verify if there are any diff files after pre-commit
run: |
git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
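If this check fails in CI, the intended remedy is the one the message points at: run the hooks locally, commit whatever they change, and push again. A minimal local sketch, assuming the `uv`-based setup described later in CONTRIBUTING.md:

```bash
# Run every pre-commit hook across the whole tree, then verify nothing was left modified.
uv run pre-commit run --all-files
git diff --exit-code || echo "pre-commit modified files; review and commit them before pushing"
```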


@ -1,148 +0,0 @@
name: Docker Build and Publish
on:
workflow_dispatch:
inputs:
version:
description: 'TestPyPI or PyPI version to build (e.g., 0.0.63.dev20250114)'
required: true
type: string
jobs:
build-and-push:
runs-on: ubuntu-latest
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to the Container registry
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Set version
id: version
run: |
if [ "${{ github.event_name }}" = "push" ]; then
echo "VERSION=0.0.63.dev51206766" >> $GITHUB_OUTPUT
else
echo "VERSION=${{ inputs.version }}" >> $GITHUB_OUTPUT
fi
- name: Check package version availability
run: |
# Function to check if version exists in a repository
check_version() {
local repo=$1
local VERSION_TO_CHECK=${{ steps.version.outputs.version }}
echo "Checking version $VERSION_TO_CHECK in $repo"
result=$(curl -s "https://$repo.org/pypi/llama-stack/json" | jq --arg v "$VERSION_TO_CHECK" '.releases | has($v)')
echo "Result: $result"
return $([ "$result" = "true" ])
}
# Check TestPyPI first, then PyPI
if check_version "test.pypi"; then
echo "Version ${{ steps.version.outputs.version }} found in TestPyPI"
echo "PYPI_SOURCE=testpypi" >> $GITHUB_ENV
elif check_version "pypi"; then
echo "Version ${{ steps.version.outputs.version }} found in PyPI"
echo "PYPI_SOURCE=pypi" >> $GITHUB_ENV
else
echo "Error: Version ${{ steps.version.outputs.version }} not found in either TestPyPI or PyPI"
exit 1
fi
- name: Install llama-stack
run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
if [ "${{ github.event_name }}" = "push" ]; then
pip install -e .
else
if [ "$PYPI_SOURCE" = "testpypi" ]; then
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple llama-stack==${{ steps.version.outputs.version }}
else
pip install llama-stack==${{ steps.version.outputs.version }}
fi
fi
- name: Build docker image
run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
echo "VERSION=${{ steps.version.outputs.version }}"
TEMPLATES=("ollama" "bedrock" "remote-vllm" "fireworks" "together" "tgi" "meta-reference-gpu")
for template in "${TEMPLATES[@]}"; do
if [ "$PYPI_SOURCE" = "testpypi" ]; then
TEST_PYPI_VERSION=${{ steps.version.outputs.version }} llama stack build --template $template --image-type container
else
PYPI_VERSION=${{ steps.version.outputs.version }} llama stack build --template $template --image-type container
fi
done
- name: List docker images
run: |
docker images
# TODO (xiyan): make the following 2 steps into a matrix and test all templates other than fireworks
- name: Start up built docker image
run: |
cd distributions/fireworks
if [ "$PYPI_SOURCE" = "testpypi" ]; then
sed -i 's|image: llamastack/distribution-fireworks|image: distribution-fireworks:test-${{ steps.version.outputs.version }}|' ./compose.yaml
else
sed -i 's|image: llamastack/distribution-fireworks|image: distribution-fireworks:${{ steps.version.outputs.version }}|' ./compose.yaml
fi
docker compose up -d
cd ..
# Wait for the container to start
timeout=300
while ! curl -s -f http://localhost:8321/v1/version > /dev/null && [ $timeout -gt 0 ]; do
echo "Waiting for endpoint to be available..."
sleep 5
timeout=$((timeout - 5))
done
if [ $timeout -le 0 ]; then
echo "Timeout waiting for endpoint to become available"
exit 1
fi
- name: Run simple models list test on docker server
run: |
curl http://localhost:8321/v1/models
# TODO (xiyan): figure out why client cannot find server but curl works
# - name: Run pytest on docker server
# run: |
# pip install pytest pytest-md-report
# export LLAMA_STACK_BASE_URL="http://localhost:8321"
# LLAMA_STACK_BASE_URL="http://localhost:8321" pytest -v tests/client-sdk/inference/test_inference.py --md-report --md-report-verbose=1
- name: Push to dockerhub
run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
echo "VERSION=${{ steps.version.outputs.version }}"
TEMPLATES=("ollama" "bedrock" "remote-vllm" "fireworks" "together" "tgi" "meta-reference-gpu")
for template in "${TEMPLATES[@]}"; do
if [ "$PYPI_SOURCE" = "testpypi" ]; then
docker tag distribution-$template:test-${{ steps.version.outputs.version }} llamastack/distribution-$template:test-${{ steps.version.outputs.version }}
docker push llamastack/distribution-$template:test-${{ steps.version.outputs.version }}
else
docker tag distribution-$template:${{ steps.version.outputs.version }} llamastack/distribution-$template:${{ steps.version.outputs.version }}
docker tag distribution-$template:${{ steps.version.outputs.version }} llamastack/distribution-$template:latest
docker push llamastack/distribution-$template:${{ steps.version.outputs.version }}
docker push llamastack/distribution-$template:latest
fi
done


@ -1,244 +0,0 @@
name: Publish Python 🐍 distribution 📦 to TestPyPI
on:
workflow_dispatch: # Keep manual trigger
inputs:
version:
description: 'Version number (e.g. 0.0.63.dev20250111)'
required: true
type: string
schedule:
- cron: "0 0 * * *" # Run every day at midnight
jobs:
trigger-client-and-models-build:
name: Trigger llama-stack-client and llama-models build
runs-on: ubuntu-latest
outputs:
version: ${{ steps.version.outputs.version }}
client_run_id: ${{ steps.trigger-client.outputs.workflow_id }}
model_run_id: ${{ steps.trigger-models.outputs.workflow_id }}
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Get date
id: date
run: echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
- name: Compute version based on dispatch event
id: version
run: |
# Read base version from pyproject.toml
version=$(sed -n 's/.*version="\([^"]*\)".*/\1/p' setup.py)
if [ "${{ github.event_name }}" = "schedule" ]; then
echo "version=${version}.dev${{ steps.date.outputs.date }}" >> $GITHUB_OUTPUT
elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "version=${{ inputs.version }}" >> $GITHUB_OUTPUT
else
echo "version=${version}.dev$(shuf -i 10000000-99999999 -n 1)" >> $GITHUB_OUTPUT
fi
- name: Trigger llama-stack-client workflow
id: trigger-client
run: |
response=$(curl -X POST https://api.github.com/repos/meta-llama/llama-stack-client-python/dispatches \
-H 'Accept: application/vnd.github.everest-preview+json' \
-H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
--data "{\"event_type\": \"build-client-package\", \"client_payload\": {\"source\": \"llama-stack-nightly\", \"version\": \"${{ steps.version.outputs.version }}\"}}" \
-w "\n%{http_code}")
http_code=$(echo "$response" | tail -n1)
if [ "$http_code" != "204" ]; then
echo "Failed to trigger client workflow"
exit 1
fi
# Get the run ID of the triggered workflow
sleep 5 # Wait for workflow to be created
run_id=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-stack-client-python/actions/runs?event=repository_dispatch" \
| jq '.workflow_runs[0].id')
echo "workflow_id=$run_id" >> $GITHUB_OUTPUT
- name: Trigger llama-models workflow
id: trigger-models
run: |
response=$(curl -X POST https://api.github.com/repos/meta-llama/llama-models/dispatches \
-H 'Accept: application/vnd.github.everest-preview+json' \
-H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
--data "{\"event_type\": \"build-models-package\", \"client_payload\": {\"source\": \"llama-stack-nightly\", \"version\": \"${{ steps.version.outputs.version }}\"}}" \
-w "\n%{http_code}")
http_code=$(echo "$response" | tail -n1)
if [ "$http_code" != "204" ]; then
echo "Failed to trigger models workflow"
exit 1
fi
# Get the run ID of the triggered workflow
sleep 5 # Wait for workflow to be created
run_id=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-models/actions/runs?event=repository_dispatch" \
| jq '.workflow_runs[0].id')
echo "workflow_id=$run_id" >> $GITHUB_OUTPUT
wait-for-workflows:
name: Wait for triggered workflows
needs: trigger-client-and-models-build
runs-on: ubuntu-latest
steps:
- name: Wait for client workflow
run: |
while true; do
status=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-stack-client-python/actions/runs/${{ needs.trigger-client-and-models-build.outputs.client_run_id }}" \
| jq -r '.status')
conclusion=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-stack-client-python/actions/runs/${{ needs.trigger-client-and-models-build.outputs.client_run_id }}" \
| jq -r '.conclusion')
echo "llama-stack-client-python workflow status: $status, conclusion: $conclusion"
if [ "$status" = "completed" ]; then
if [ "$conclusion" != "success" ]; then
echo "llama-stack-client-python workflow failed"
exit 1
fi
break
fi
sleep 10
done
- name: Wait for models workflow
run: |
while true; do
status=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-models/actions/runs/${{ needs.trigger-client-and-models-build.outputs.model_run_id }}" \
| jq -r '.status')
conclusion=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-models/actions/runs/${{ needs.trigger-client-and-models-build.outputs.model_run_id }}" \
| jq -r '.conclusion')
echo "llama-models workflow status: $status, conclusion: $conclusion"
if [ "$status" = "completed" ]; then
if [ "$conclusion" != "success" ]; then
echo "llama-models workflow failed"
exit 1
fi
break
fi
sleep 10
done
build:
name: Build distribution 📦
needs:
- wait-for-workflows
- trigger-client-and-models-build
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Get date
id: date
run: echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
- name: Update version for nightly
run: |
sed -i 's/version="\([^"]*\)"/version="${{ needs.trigger-client-and-models-build.outputs.version }}"/' setup.py
sed -i 's/llama-stack-client>=\([^"]*\)/llama-stack-client==${{ needs.trigger-client-and-models-build.outputs.version }}/' requirements.txt
sed -i 's/llama-models>=\([^"]*\)/llama-models==${{ needs.trigger-client-and-models-build.outputs.version }}/' requirements.txt
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install pypa/build
run: >-
python3 -m
pip install
build
--user
- name: Build a binary wheel and a source tarball
run: python3 -m build
- name: Store the distribution packages
uses: actions/upload-artifact@v4
with:
name: python-package-distributions
path: dist/
publish-to-testpypi:
name: Publish Python 🐍 distribution 📦 to TestPyPI
needs:
- build
runs-on: ubuntu-latest
environment:
name: testrelease
url: https://test.pypi.org/p/llama-stack
permissions:
id-token: write # IMPORTANT: mandatory for trusted publishing
steps:
- name: Download all the dists
uses: actions/download-artifact@v4
with:
name: python-package-distributions
path: dist/
- name: Publish distribution 📦 to TestPyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
repository-url: https://test.pypi.org/legacy/
test-published-package:
name: Test published package
needs:
- publish-to-testpypi
- trigger-client-and-models-build
runs-on: ubuntu-latest
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Install the package
run: |
max_attempts=6
attempt=1
while [ $attempt -le $max_attempts ]; do
echo "Attempt $attempt of $max_attempts to install package..."
if pip install --no-cache --index-url https://pypi.org/simple/ --extra-index-url https://test.pypi.org/simple/ llama-stack==${{ needs.trigger-client-and-models-build.outputs.version }}; then
echo "Package installed successfully"
break
fi
if [ $attempt -ge $max_attempts ]; then
echo "Failed to install package after $max_attempts attempts"
exit 1
fi
attempt=$((attempt + 1))
sleep 10
done
- name: Test the package versions
run: |
pip list | grep llama_
- name: Test CLI commands
run: |
llama model list
llama stack build --list-templates
llama model prompt-format -m Llama3.2-11B-Vision-Instruct
llama stack list-apis
llama stack list-providers inference
llama stack list-providers telemetry
- name: Test Notebook
run: |
pip install pytest nbval
llama stack build --template together --image-type venv
pytest -v -s --nbval-lax ./docs/getting_started.ipynb
pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
# TODO: add trigger for integration test workflow & docker builds

.github/workflows/semantic-pr.yml

@ -0,0 +1,21 @@
name: Check semantic PR titles
on:
pull_request_target:
types:
- opened
- edited
- reopened
- synchronize
permissions:
contents: read
jobs:
title-check:
runs-on: ubuntu-latest
steps:
- name: Check PR Title's semantic conformance
uses: amannn/action-semantic-pull-request@v5
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/tests.yml

@ -0,0 +1,69 @@
name: auto-tests
on:
# pull_request:
workflow_dispatch:
inputs:
commit_sha:
description: 'Specific Commit SHA to trigger on'
required: false
default: $GITHUB_SHA # default to the last commit of $GITHUB_REF branch
jobs:
test-llama-stack-as-library:
runs-on: ubuntu-latest
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
strategy:
matrix:
provider: [fireworks, together]
steps:
- uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.commit_sha }}
- name: Echo commit SHA
run: |
echo "Triggered on commit SHA: ${{ github.event.inputs.commit_sha }}"
git rev-parse HEAD
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt pytest
pip install -e .
- name: Build providers
run: |
llama stack build --template ${{ matrix.provider }} --image-type venv
- name: Install the latest llama-stack-client & llama-models packages
run: |
pip install -e git+https://github.com/meta-llama/llama-stack-client-python.git#egg=llama-stack-client
pip install -e git+https://github.com/meta-llama/llama-models.git#egg=llama-models
- name: Run client-sdk test
working-directory: "${{ github.workspace }}"
env:
REPORT_OUTPUT: md_report.md
shell: bash
run: |
pip install --upgrade pytest-md-report
echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV"
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT"
- name: Output reports to the job summary
if: always()
shell: bash
run: |
if [ -f "$REPORT_FILE" ]; then
echo "<details><summary> Test Report for ${{ matrix.provider }} </summary>" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
cat "$REPORT_FILE" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "</details>" >> $GITHUB_STEP_SUMMARY
fi


@ -0,0 +1,40 @@
name: Update ReadTheDocs
on:
workflow_dispatch:
inputs:
branch:
description: 'RTD version to update'
required: false
default: 'latest'
push:
branches:
- main
paths:
- 'docs/source/**'
- 'docs/resources/**'
- '.github/workflows/update-readthedocs.yml'
jobs:
update-readthedocs:
runs-on: ubuntu-latest
env:
TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
steps:
- name: Trigger ReadTheDocs build
run: |
if [ -z "$TOKEN" ]; then
echo "READTHEDOCS_TOKEN is not set"
exit 1
fi
response=$(curl -X POST \
-H "Content-Type: application/json" \
-d "{\"token\": \"$TOKEN\"}" \
https://readthedocs.org/api/v2/webhook/llama-stack/289768/)
echo "Response: $response"
if [ $(echo $response | jq -r '.build_triggered') != 'true' ]; then
echo "Failed to trigger ReadTheDocs build"
exit 1
fi

.gitignore

@ -19,3 +19,4 @@ Package.resolved
_build
docs/src
pyrightconfig.json
venv/


@ -5,10 +5,8 @@ default_language_version:
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: 6306a48f7dae5861702d573c9c247e4e9498e867
rev: v5.0.0 # Latest stable version
hooks:
- id: trailing-whitespace
- id: check-ast
- id: check-merge-conflict
- id: check-added-large-files
args: ['--maxkb=1000']
@ -28,23 +26,41 @@ repos:
- --license-filepath
- docs/license_header.txt
- repo: https://github.com/pycqa/flake8
rev: 34cbf8ef3950f43d09b85e2e45c15ae5717dc37b
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.4
hooks:
- id: flake8
additional_dependencies:
- flake8-bugbear == 22.4.25
- pep8-naming == 0.12.1
- torchfix
args: ['--config=.flake8']
# Run the linter with import sorting.
- id: ruff
args: [
--fix,
--exit-non-zero-on-fix,
--select, I,
]
- id: ruff-format
- repo: https://github.com/omnilib/ufmt
rev: v2.7.0
- repo: https://github.com/adamchainz/blacken-docs
rev: 1.19.0
hooks:
- id: ufmt
- id: blacken-docs
additional_dependencies:
- black == 24.4.2
- usort == 1.0.8
- black==24.3.0
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.5.26
hooks:
- id: uv-export
args: ["--frozen", "--no-hashes", "--no-emit-project"]
- id: uv-sync
# - repo: https://github.com/pre-commit/mirrors-mypy
# rev: v1.14.0
# hooks:
# - id: mypy
# additional_dependencies:
# - types-requests
# - types-setuptools
# - pydantic
# args: [--ignore-missing-imports]
# - repo: https://github.com/jsh9/pydoclint
# rev: d88180a8632bb1602a4d81344085cf320f288c5a
@ -71,3 +87,7 @@ repos:
# require_serial: true
# files: ^llama_stack/templates/.*$
# stages: [manual]
ci:
autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate


@ -1,7 +1,8 @@
[flake8]
# Suggested config from pytorch that we can adapt
select = B,C,E,F,N,P,T4,W,B9,TOR0,TOR1,TOR2
max-line-length = 120
lint.select = ["B", "C", "E" , "F" , "N", "W", "B9"]
line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
# N812 ignored because import torch.nn.functional as F is PyTorch convention
@ -9,23 +10,28 @@ max-line-length = 120
# E731 allow usage of assigning lambda expressions
# E701 let black auto-format statements on one line
# E704 let black auto-format statements on one line
ignore =
E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,N812,N817,E731,E701,E704
lint.ignore = [
"E203", "E305", "E402", "E501", "E721", "E741", "F405", "F821", "F841",
"C408", "E302", "W291", "E303", "N812", "N817", "E731", "E701",
# These are the additional ones we started ignoring after moving to ruff. We should look into each one of them later.
"C901", "C405", "C414", "N803", "N999", "C403", "C416", "B028", "C419", "C401", "B023",
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
"EXE001",
# random naming hints don't need
N802,
"N802",
# these ignores are from flake8-bugbear; please fix!
B007,B008,B950
optional-ascii-coding = True
exclude =
./.git,
./docs/*,
./build,
./scripts,
./venv,
*.pyi,
.pre-commit-config.yaml,
*.md,
.flake8
"B007", "B008"
]
exclude = [
"./.git",
"./docs/*",
"./build",
"./scripts",
"./venv",
"*.pyi",
".pre-commit-config.yaml",
"*.md",
".flake8"
]


@ -1,35 +0,0 @@
# Changelog
## 0.0.53
### Added
- Resource-oriented design for models, shields, memory banks, datasets and eval tasks
- Persistence for registered objects with distribution
- Ability to persist memory banks created for FAISS
- PostgreSQL KVStore implementation
- Environment variable placeholder support in run.yaml files
- Comprehensive Zero-to-Hero notebooks and quickstart guides
- Support for quantized models in Ollama
- Vision models support for Together, Fireworks, Meta-Reference, and Ollama, and vLLM
- Bedrock distribution with safety shields support
- Evals API with task registration and scoring functions
- MMLU and SimpleQA benchmark scoring functions
- Huggingface dataset provider integration for benchmarks
- Support for custom dataset registration from local paths
- Benchmark evaluation CLI tools with visualization tables
- RAG evaluation scoring functions and metrics
- Local persistence for datasets and eval tasks
### Changed
- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
- Updated API signatures for dataset and eval task registration
- Restructured folder organization for providers
- Enhanced Docker build configuration
- Added version prefixing for REST API routes
- Enhanced evaluation task registration workflow
- Improved benchmark evaluation output formatting
- Restructured evals folder organization for better modularity
### Removed
- `llama stack configure` command


@ -40,6 +40,7 @@ If you need help or guidance, comment on the issue. Issues that are extra friend
3. Ensure the test suite passes.
4. Make sure your code lints using `pre-commit`.
5. If you haven't already, complete the Contributor License Agreement ("CLA").
6. Ensure your pull request follows the [conventional commits format](https://www.conventionalcommits.org/en/v1.0.0/).
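For reference, titles accepted by the conventional commits format follow a `type: description` (optionally `type(scope): description`) pattern; the examples below are hypothetical and not taken from this repository:

```bash
# Hypothetical PR titles in the conventional commits format:
#   feat: add a new distribution template
#   fix(docs): repair a broken link in the getting started guide
#   chore: bump pre-commit hook versions
```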
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
@ -56,22 +57,50 @@ disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Set up your development environment
We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
You can install the dependencies by running:
```bash
$ cd llama-stack
$ uv sync --extra dev
$ uv pip install -e .
$ source .venv/bin/activate
```
## Pre-commit Hooks
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
```bash
$ cd llama-stack
$ conda activate <your-environment>
$ pip install pre-commit
$ pre-commit install
$ uv run pre-commit install
```
After that, pre-commit hooks will run automatically before each commit.
Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
```bash
$ uv run pre-commit run --all-files
```
> [!CAUTION]
> Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
## Adding a new dependency to the project
To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
```bash
$ uv add foo
$ uv sync
```
## Coding Style
* 2 spaces for indentation rather than tabs
* 4 spaces for indentation rather than tabs
* 80 character line length
* ...
@ -102,13 +131,12 @@ If you have made changes to a provider's configuration in any form (introducing
If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.
```bash
cd llama-stack/docs
pip install -r requirements.txt
pip install sphinx-autobuild
$ cd llama-stack/docs
$ uv sync --extra docs
# This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
make html
sphinx-autobuild source build/html
$ make html
$ uv run sphinx-autobuild source build/html
```


@ -1,4 +1,4 @@
include requirements.txt
include pyproject.toml
include distributions/dependencies.json
include llama_stack/distribution/*.sh
include llama_stack/cli/scripts/*.sh


@ -2,17 +2,18 @@
[![PyPI version](https://img.shields.io/pypi/v/llama_stack.svg)](https://pypi.org/project/llama_stack/)
[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/)
[![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)
[![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack)
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb)
Llama Stack defines and standardizes the core building blocks that simplify AI application development. It codified best practices across the Llama ecosystem. More specifically, it provides
Llama Stack standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides
- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
- **Plugin architecture** to support the rich ecosystem of implementations of the different APIs in different environments like local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment
- **Multiple developer interfaces** like CLI and SDKs for Python, Node, iOS, and Android
- **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack
- **Plugin architecture** to support the rich ecosystem of different API implementations in various environments, including local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment.
- **Multiple developer interfaces** like CLI and SDKs for Python, Typescript, iOS, and Android.
- **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack.
<div style="text-align: center;">
<img
@ -24,31 +25,31 @@ Llama Stack defines and standardizes the core building blocks that simplify AI a
</div>
### Llama Stack Benefits
- **Flexible Options**: Developers can choose their preferred infrastructure without changing APIs and enjoy flexible deployment choice.
- **Consistent Experience**: With its unified APIs Llama Stack makes it easier to build, test, and deploy AI applications with consistent application behavior.
- **Flexible Options**: Developers can choose their preferred infrastructure without changing APIs and enjoy flexible deployment choices.
- **Consistent Experience**: With its unified APIs, Llama Stack makes it easier to build, test, and deploy AI applications with consistent application behavior.
- **Robust Ecosystem**: Llama Stack is already integrated with distribution partners (cloud providers, hardware vendors, and AI-focused companies) that offer tailored infrastructure, software, and services for deploying Llama models.
By reducing friction and complexity, Llama Stack empowers developers to focus on what they do best: building transformative generative AI applications.
### API Providers
Here is a list of the various API providers and available distributions to developers started easily,
Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
|:------------------------------------------------------------------------------------------:|:----------------------:|:------------------:|:------------------:|:------------------:|:------------------:|:------------------:|
| Meta Reference | Single Node | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| SambaNova | Hosted | | :heavy_check_mark: | | | |
| Cerebras | Hosted | | :heavy_check_mark: | | | |
| Fireworks | Hosted | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | |
| AWS Bedrock | Hosted | | :heavy_check_mark: | | :heavy_check_mark: | |
| Together | Hosted | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | |
| Groq | Hosted | | :heavy_check_mark: | | | |
| Ollama | Single Node | | :heavy_check_mark: | | | |
| TGI | Hosted and Single Node | | :heavy_check_mark: | | | |
| NVIDIA NIM | Hosted and Single Node | | :heavy_check_mark: | | | |
| Chroma | Single Node | | | :heavy_check_mark: | | |
| PG Vector | Single Node | | | :heavy_check_mark: | | |
| PyTorch ExecuTorch | On-device iOS | :heavy_check_mark: | :heavy_check_mark: | | | |
| vLLM | Hosted and Single Node | | :heavy_check_mark: | | | |
| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ |
| SambaNova | Hosted | | ✅ | | | |
| Cerebras | Hosted | | ✅ | | | |
| Fireworks | Hosted | ✅ | ✅ | ✅ | | |
| AWS Bedrock | Hosted | | ✅ | | ✅ | |
| Together | Hosted | ✅ | ✅ | | ✅ | |
| Groq | Hosted | | ✅ | | | |
| Ollama | Single Node | | ✅ | | | |
| TGI | Hosted and Single Node | | ✅ | | | |
| NVIDIA NIM | Hosted and Single Node | | ✅ | | | |
| Chroma | Single Node | | | ✅ | | |
| PG Vector | Single Node | | | ✅ | | |
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | |
| vLLM | Hosted and Single Node | | ✅ | | | |
### Distributions
@ -70,15 +71,15 @@ A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider
You have two ways to install this repository:
1. **Install as a package**:
* **Install as a package**:
You can install the repository directly from [PyPI](https://pypi.org/project/llama-stack/) by running the following command:
```bash
pip install llama-stack
```
2. **Install from source**:
* **Install from source**:
If you prefer to install from the source code, make sure you have [conda installed](https://docs.conda.io/projects/conda/en/stable).
Then, follow these steps:
Then, run the following commands:
```bash
mkdir -p ~/local
cd ~/local
@ -95,10 +96,11 @@ You have two ways to install this repository:
Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details.
* [CLI reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html)
* Guide using `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
* [Getting Started](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html)
* Quick guide to start a Llama Stack server.
* CLI references
* [llama (server-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html): Guide for using the `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
* [llama (client-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_stack_client_cli_reference.html): Guide for using the `llama-stack-client` CLI, which allows you to query information about the distribution.
* Getting Started
* [Quick guide to start a Llama Stack server](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).
* [Jupyter notebook](./docs/getting_started.ipynb) to walk-through how to use simple text and vision inference llama_stack_client APIs
* The complete Llama Stack lesson [Colab notebook](https://colab.research.google.com/drive/1dtVmxotBsI4cGZQNsJRYPrLiDeT0Wnwt) of the new [Llama 3.2 course on Deeplearning.ai](https://learn.deeplearning.ai/courses/introducing-multimodal-llama-3-2/lesson/8/llama-stack).
* A [Zero-to-Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guide you through all the key components of llama stack with code samples.
@ -111,9 +113,9 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest
| :----: | :----: | :----: |
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
| Typescript | [llama-stack-client-typescript](https://github.com/meta-llama/llama-stack-client-typescript) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
Check out our client SDKs for connecting to Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [node](https://github.com/meta-llama/llama-stack-client-node), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.
Check out our client SDKs for connecting to a Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.
You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.
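As a quick sanity check once a distribution is up, the same endpoint exercised elsewhere in this change can be queried from the shell; a sketch, assuming a local server on port 8321 and a client configured to point at it:

```bash
# List the models served by a locally running Llama Stack distribution.
curl http://localhost:8321/v1/models

# Equivalent query through the llama-stack-client CLI that ships with the Python SDK.
llama-stack-client models list
```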


@ -1,9 +1,46 @@
{
"sambanova": [
"bedrock": [
"aiosqlite",
"autoevals",
"blobfile",
"boto3",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"cerebras": [
"aiosqlite",
"autoevals",
"blobfile",
"cerebras_cloud_sdk",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
@ -27,7 +64,110 @@
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"dell": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"fireworks": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"hf-endpoint": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"hf-serverless": [
"aiohttp",
@ -62,211 +202,7 @@
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"together": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"together",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"vllm-gpu": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"vllm",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"remote-vllm": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"fireworks": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"tgi": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"bedrock": [
"aiosqlite",
"autoevals",
"blobfile",
"boto3",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"meta-reference-gpu": [
"accelerate",
@ -306,39 +242,7 @@
"uvicorn",
"zmq",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"nvidia": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"meta-reference-quantized-gpu": [
"accelerate",
@ -380,21 +284,20 @@
"uvicorn",
"zmq",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"cerebras": [
"nvidia": [
"aiosqlite",
"autoevals",
"blobfile",
"cerebras_cloud_sdk",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
@ -413,7 +316,7 @@
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"ollama": [
"aiohttp",
@ -447,9 +350,72 @@
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"hf-endpoint": [
"remote-vllm": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"sambanova": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"tgi": [
"aiohttp",
"aiosqlite",
"autoevals",
@ -482,6 +448,74 @@
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"together": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"together",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"vllm-gpu": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"vllm",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
]
}


@ -1,65 +0,0 @@
# Together Distribution
### Connect to a Llama Stack Together Endpoint
- You may connect to a hosted endpoint `https://llama-stack.together.ai`, serving a Llama Stack distribution
The `llamastack/distribution-together` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | remote::together | meta-reference | meta-reference, remote::weaviate | meta-reference | meta-reference |
### Docker: Start the Distribution (Single Node CPU)
> [!NOTE]
> This assumes you have an hosted endpoint at Together with API Key.
```
$ cd distributions/together
$ ls
compose.yaml run.yaml
$ docker compose up
```
Make sure in you `run.yaml` file, you inference provider is pointing to the correct Together URL server endpoint. E.g.
```
inference:
- provider_id: together
provider_type: remote::together
config:
url: https://api.together.xyz/v1
api_key: <optional api key>
```
### Conda llama stack run (Single Node CPU)
```bash
llama stack build --template together --image-type conda
# -- modify run.yaml to a valid Together server endpoint
llama stack run ./run.yaml
```
### (Optional) Update Model Serving Configuration
Use `llama-stack-client models list` to check the available models served by together.
```
$ llama-stack-client models list
+------------------------------+------------------------------+---------------+------------+
| identifier | llama_model | provider_id | metadata |
+==============================+==============================+===============+============+
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
```


@ -12,3 +12,7 @@
.wy-side-nav-search {
background-color: transparent !important;
}
.hide-title h1 {
display: none;
}

File diff suppressed because it is too large

File diff suppressed because it is too large

docs/conftest.py

@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
def pytest_collection_modifyitems(items):
for item in items:
item.name = item.name.replace(' ', '_')


@ -7,7 +7,7 @@
"id": "c1e7571c"
},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1F2ksmkoGQPa4pzRjMOE6BXWeOxWFIW6n?usp=sharing)\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)\n",
"\n",
"# Llama Stack - Building AI Applications\n",
"\n",
@ -15,7 +15,7 @@
"\n",
"[Llama Stack](https://github.com/meta-llama/llama-stack) defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations.\n",
"\n",
"Read more about the project: https://llama-stack.readthedocs.io/en/latest/index.html\n",
"Read more about the project here: https://llama-stack.readthedocs.io/en/latest/index.html\n",
"\n",
"In this guide, we will showcase how you can build LLM-powered agentic applications using Llama Stack.\n"
]
@ -71,7 +71,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "J2kGed0R5PSf",
"metadata": {
"colab": {
@ -81,119 +81,15 @@
"id": "J2kGed0R5PSf",
"outputId": "2478ea60-8d35-48a1-b011-f233831740c5"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" bubblewrap\n",
"0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.\n",
"Need to get 46.3 kB of archives.\n",
"After this operation, 132 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 bubblewrap amd64 0.6.1-1ubuntu0.1 [46.3 kB]\n",
"Fetched 46.3 kB in 0s (122 kB/s)\n",
"Selecting previously unselected package bubblewrap.\n",
"(Reading database ... 124561 files and directories currently installed.)\n",
"Preparing to unpack .../bubblewrap_0.6.1-1ubuntu0.1_amd64.deb ...\n",
"Unpacking bubblewrap (0.6.1-1ubuntu0.1) ...\n",
"Setting up bubblewrap (0.6.1-1ubuntu0.1) ...\n",
"Processing triggers for man-db (2.10.2-1) ...\n",
"Looking in indexes: https://test.pypi.org/simple/, https://pypi.python.org/simple\n",
"Collecting llama-stack==0.1.0rc10\n",
" Downloading https://test-files.pythonhosted.org/packages/68/22/4a170fbe01095df81e76c7bf8f35c716c1a0a5ec4503da6e78695fab351c/llama_stack-0.1.0rc10-py3-none-any.whl.metadata (15 kB)\n",
"Collecting blobfile (from llama-stack==0.1.0rc10)\n",
" Downloading blobfile-3.0.0-py3-none-any.whl.metadata (15 kB)\n",
"Collecting fire (from llama-stack==0.1.0rc10)\n",
" Downloading fire-0.7.0.tar.gz (87 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.2/87.2 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: httpx in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (0.28.1)\n",
"Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (0.27.1)\n",
"Collecting llama-models==0.1.0rc10 (from llama-stack==0.1.0rc10)\n",
" Downloading https://test-files.pythonhosted.org/packages/45/2b/6a6947d5915054b9980f82606942f1b79960a27168299254ca12e5b5795b/llama_models-0.1.0rc10-py3-none-any.whl.metadata (8.5 kB)\n",
"Collecting llama-stack-client==0.1.0rc10 (from llama-stack==0.1.0rc10)\n",
" Downloading https://test-files.pythonhosted.org/packages/d6/85/a4fd621c4ae4db7339ab098b37bf4b4ad3cc12440e75ef10ec524e28ef7d/llama_stack_client-0.1.0rc10-py3-none-any.whl.metadata (15 kB)\n",
"Requirement already satisfied: prompt-toolkit in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (3.0.48)\n",
"Collecting python-dotenv (from llama-stack==0.1.0rc10)\n",
" Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)\n",
"Requirement already satisfied: pydantic>=2 in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (2.10.5)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (2.32.3)\n",
"Requirement already satisfied: rich in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (13.9.4)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (75.1.0)\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (2.5.0)\n",
"Requirement already satisfied: PyYAML in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (6.0.2)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (3.1.5)\n",
"Collecting tiktoken (from llama-models==0.1.0rc10->llama-stack==0.1.0rc10)\n",
" Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
"Requirement already satisfied: Pillow in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (11.1.0)\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (3.7.1)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (8.1.8)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (1.9.0)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (2.2.2)\n",
"Collecting pyaml (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10)\n",
" Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (1.3.1)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (4.67.1)\n",
"Requirement already satisfied: typing-extensions<5,>=4.7 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (4.12.2)\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack==0.1.0rc10) (2024.12.14)\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack==0.1.0rc10) (1.0.7)\n",
"Requirement already satisfied: idna in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack==0.1.0rc10) (3.10)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx->llama-stack==0.1.0rc10) (0.14.0)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic>=2->llama-stack==0.1.0rc10) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.27.2 in /usr/local/lib/python3.11/dist-packages (from pydantic>=2->llama-stack==0.1.0rc10) (2.27.2)\n",
"Collecting pycryptodomex>=3.8 (from blobfile->llama-stack==0.1.0rc10)\n",
" Downloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)\n",
"Requirement already satisfied: urllib3<3,>=1.25.3 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack==0.1.0rc10) (2.3.0)\n",
"Requirement already satisfied: lxml>=4.9 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack==0.1.0rc10) (5.3.0)\n",
"Requirement already satisfied: filelock>=3.0 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack==0.1.0rc10) (3.16.1)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub->llama-stack==0.1.0rc10) (2024.10.0)\n",
"Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub->llama-stack==0.1.0rc10) (24.2)\n",
"Requirement already satisfied: wcwidth in /usr/local/lib/python3.11/dist-packages (from prompt-toolkit->llama-stack==0.1.0rc10) (0.2.13)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->llama-stack==0.1.0rc10) (3.4.1)\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich->llama-stack==0.1.0rc10) (3.0.0)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich->llama-stack==0.1.0rc10) (2.18.0)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich->llama-stack==0.1.0rc10) (0.1.2)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (3.0.2)\n",
"Requirement already satisfied: numpy>=1.23.2 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (1.26.4)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (2024.2)\n",
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.11/dist-packages (from tiktoken->llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (2024.11.6)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (1.17.0)\n",
"Downloading https://test-files.pythonhosted.org/packages/68/22/4a170fbe01095df81e76c7bf8f35c716c1a0a5ec4503da6e78695fab351c/llama_stack-0.1.0rc10-py3-none-any.whl (532 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m532.7/532.7 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading https://test-files.pythonhosted.org/packages/45/2b/6a6947d5915054b9980f82606942f1b79960a27168299254ca12e5b5795b/llama_models-0.1.0rc10-py3-none-any.whl (1.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading https://test-files.pythonhosted.org/packages/d6/85/a4fd621c4ae4db7339ab098b37bf4b4ad3cc12440e75ef10ec524e28ef7d/llama_stack_client-0.1.0rc10-py3-none-any.whl (328 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m328.5/328.5 kB\u001b[0m \u001b[31m29.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading blobfile-3.0.0-py3-none-any.whl (75 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.4/75.4 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)\n",
"Downloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m57.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)\n",
"Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m64.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hBuilding wheels for collected packages: fire\n",
" Building wheel for fire (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for fire: filename=fire-0.7.0-py3-none-any.whl size=114249 sha256=3a37285ecae37a5fb69bbad717aabdb8c13f0da7906668b7c123475eefa41c3b\n",
" Stored in directory: /root/.cache/pip/wheels/46/54/24/1624fd5b8674eb1188623f7e8e17cdf7c0f6c24b609dfb8a89\n",
"Successfully built fire\n",
"Installing collected packages: python-dotenv, pycryptodomex, pyaml, fire, tiktoken, blobfile, llama-stack-client, llama-models, llama-stack\n",
"Successfully installed blobfile-3.0.0 fire-0.7.0 llama-models-0.1.0rc10 llama-stack-0.1.0rc10 llama-stack-client-0.1.0rc10 pyaml-25.1.0 pycryptodomex-3.21.0 python-dotenv-1.0.1 tiktoken-0.8.0\n"
]
}
],
"outputs": [],
"source": [
"# NBVAL_SKIP\n",
"\n",
"!apt-get install -y bubblewrap\n",
"# install a branch of llama stack\n",
"!pip install llama-stack"
"import os\n",
"os.environ[\"UV_SYSTEM_PYTHON\"] = \"1\"\n",
"!pip install uv\n",
"!uv pip install llama-stack"
]
},
{
@ -218,7 +114,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "HaepEZXCDgif",
"metadata": {
"colab": {
@ -228,331 +124,9 @@
"id": "HaepEZXCDgif",
"outputId": "9314f698-593d-4c1a-ea15-15c735dc1023"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: llama-stack in /usr/local/lib/python3.11/dist-packages (0.1.0rc10)\r\n",
"Requirement already satisfied: blobfile in /usr/local/lib/python3.11/dist-packages (from llama-stack) (3.0.0)\r\n",
"Requirement already satisfied: fire in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.7.0)\r\n",
"Requirement already satisfied: httpx in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.28.1)\r\n",
"Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.27.1)\r\n",
"Requirement already satisfied: llama-models==0.1.0rc10 in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.1.0rc10)\r\n",
"Requirement already satisfied: llama-stack-client==0.1.0rc10 in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.1.0rc10)\r\n",
"Requirement already satisfied: prompt-toolkit in /usr/local/lib/python3.11/dist-packages (from llama-stack) (3.0.48)\r\n",
"Requirement already satisfied: python-dotenv in /usr/local/lib/python3.11/dist-packages (from llama-stack) (1.0.1)\r\n",
"Requirement already satisfied: pydantic>=2 in /usr/local/lib/python3.11/dist-packages (from llama-stack) (2.10.5)\r\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from llama-stack) (2.32.3)\r\n",
"Requirement already satisfied: rich in /usr/local/lib/python3.11/dist-packages (from llama-stack) (13.9.4)\r\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from llama-stack) (75.1.0)\r\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.11/dist-packages (from llama-stack) (2.5.0)\r\n",
"Requirement already satisfied: PyYAML in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack) (6.0.2)\r\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack) (3.1.5)\r\n",
"Requirement already satisfied: tiktoken in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack) (0.8.0)\r\n",
"Requirement already satisfied: Pillow in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack) (11.1.0)\r\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (3.7.1)\r\n",
"Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (8.1.8)\r\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (1.9.0)\r\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (2.2.2)\r\n",
"Requirement already satisfied: pyaml in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (25.1.0)\r\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (1.3.1)\r\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (4.67.1)\r\n",
"Requirement already satisfied: typing-extensions<5,>=4.7 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (4.12.2)\r\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack) (2024.12.14)\r\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack) (1.0.7)\r\n",
"Requirement already satisfied: idna in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack) (3.10)\r\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx->llama-stack) (0.14.0)\r\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic>=2->llama-stack) (0.7.0)\r\n",
"Requirement already satisfied: pydantic-core==2.27.2 in /usr/local/lib/python3.11/dist-packages (from pydantic>=2->llama-stack) (2.27.2)\r\n",
"Requirement already satisfied: pycryptodomex>=3.8 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack) (3.21.0)\r\n",
"Requirement already satisfied: urllib3<3,>=1.25.3 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack) (2.3.0)\r\n",
"Requirement already satisfied: lxml>=4.9 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack) (5.3.0)\r\n",
"Requirement already satisfied: filelock>=3.0 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack) (3.16.1)\r\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub->llama-stack) (2024.10.0)\r\n",
"Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub->llama-stack) (24.2)\r\n",
"Requirement already satisfied: wcwidth in /usr/local/lib/python3.11/dist-packages (from prompt-toolkit->llama-stack) (0.2.13)\r\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->llama-stack) (3.4.1)\r\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich->llama-stack) (3.0.0)\r\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich->llama-stack) (2.18.0)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich->llama-stack) (0.1.2)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->llama-models==0.1.0rc10->llama-stack) (3.0.2)\n",
"Requirement already satisfied: numpy>=1.23.2 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack) (1.26.4)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack) (2024.2)\n",
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.11/dist-packages (from tiktoken->llama-models==0.1.0rc10->llama-stack) (2024.11.6)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas->llama-stack-client==0.1.0rc10->llama-stack) (1.17.0)\n",
"Installing pip dependencies\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (2.2.2)\n",
"Collecting together\n",
" Downloading together-1.3.11-py3-none-any.whl.metadata (11 kB)\n",
"Collecting datasets\n",
" Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)\n",
"Requirement already satisfied: transformers in /usr/local/lib/python3.11/dist-packages (4.47.1)\n",
"Requirement already satisfied: blobfile in /usr/local/lib/python3.11/dist-packages (3.0.0)\n",
"Requirement already satisfied: opentelemetry-sdk in /usr/local/lib/python3.11/dist-packages (1.29.0)\n",
"Collecting redis\n",
" Downloading redis-5.2.1-py3-none-any.whl.metadata (9.1 kB)\n",
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.11/dist-packages (3.10.0)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (2.32.3)\n",
"Requirement already satisfied: chardet in /usr/local/lib/python3.11/dist-packages (5.2.0)\n",
"Collecting chromadb-client\n",
" Downloading chromadb_client-0.6.3-py3-none-any.whl.metadata (2.4 kB)\n",
"Collecting psycopg2-binary\n",
" Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)\n",
"Collecting mcp\n",
" Downloading mcp-1.2.0-py3-none-any.whl.metadata (15 kB)\n",
"Requirement already satisfied: pillow in /usr/local/lib/python3.11/dist-packages (11.1.0)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (1.13.1)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (4.67.1)\n",
"Requirement already satisfied: nltk in /usr/local/lib/python3.11/dist-packages (3.9.1)\n",
"Requirement already satisfied: sentencepiece in /usr/local/lib/python3.11/dist-packages (0.2.0)\n",
"Collecting faiss-cpu\n",
" Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)\n",
"Collecting opentelemetry-exporter-otlp-proto-http\n",
" Downloading opentelemetry_exporter_otlp_proto_http-1.29.0-py3-none-any.whl.metadata (2.2 kB)\n",
"Collecting autoevals\n",
" Downloading autoevals-0.0.117-py3-none-any.whl.metadata (12 kB)\n",
"Collecting pypdf\n",
" Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)\n",
"Collecting aiosqlite\n",
" Downloading aiosqlite-0.20.0-py3-none-any.whl.metadata (4.3 kB)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (1.26.4)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.11/dist-packages (1.6.0)\n",
"Requirement already satisfied: openai in /usr/local/lib/python3.11/dist-packages (1.59.6)\n",
"Collecting fastapi\n",
" Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)\n",
"Requirement already satisfied: fire in /usr/local/lib/python3.11/dist-packages (0.7.0)\n",
"Requirement already satisfied: httpx in /usr/local/lib/python3.11/dist-packages (0.28.1)\n",
"Collecting uvicorn\n",
" Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas) (2024.2)\n",
"Requirement already satisfied: aiohttp<4.0.0,>=3.9.3 in /usr/local/lib/python3.11/dist-packages (from together) (3.11.11)\n",
"Requirement already satisfied: click<9.0.0,>=8.1.7 in /usr/local/lib/python3.11/dist-packages (from together) (8.1.8)\n",
"Requirement already satisfied: eval-type-backport<0.3.0,>=0.1.3 in /usr/local/lib/python3.11/dist-packages (from together) (0.2.2)\n",
"Requirement already satisfied: filelock<4.0.0,>=3.13.1 in /usr/local/lib/python3.11/dist-packages (from together) (3.16.1)\n",
"Collecting pillow\n",
" Downloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.2 kB)\n",
"Requirement already satisfied: pyarrow>=10.0.1 in /usr/local/lib/python3.11/dist-packages (from together) (17.0.0)\n",
"Requirement already satisfied: pydantic<3.0.0,>=2.6.3 in /usr/local/lib/python3.11/dist-packages (from together) (2.10.5)\n",
"Requirement already satisfied: rich<14.0.0,>=13.8.1 in /usr/local/lib/python3.11/dist-packages (from together) (13.9.4)\n",
"Requirement already satisfied: tabulate<0.10.0,>=0.9.0 in /usr/local/lib/python3.11/dist-packages (from together) (0.9.0)\n",
"Requirement already satisfied: typer<0.16,>=0.9 in /usr/local/lib/python3.11/dist-packages (from together) (0.15.1)\n",
"Collecting dill<0.3.9,>=0.3.0 (from datasets)\n",
" Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
"Collecting xxhash (from datasets)\n",
" Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
"Collecting multiprocess<0.70.17 (from datasets)\n",
" Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)\n",
"Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)\n",
" Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)\n",
"Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.27.1)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from datasets) (24.2)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from datasets) (6.0.2)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2024.11.6)\n",
"Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.21.0)\n",
"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.5.2)\n",
"Requirement already satisfied: pycryptodomex>=3.8 in /usr/local/lib/python3.11/dist-packages (from blobfile) (3.21.0)\n",
"Requirement already satisfied: urllib3<3,>=1.25.3 in /usr/local/lib/python3.11/dist-packages (from blobfile) (2.3.0)\n",
"Requirement already satisfied: lxml>=4.9 in /usr/local/lib/python3.11/dist-packages (from blobfile) (5.3.0)\n",
"Requirement already satisfied: opentelemetry-api==1.29.0 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-sdk) (1.29.0)\n",
"Requirement already satisfied: opentelemetry-semantic-conventions==0.50b0 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-sdk) (0.50b0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-sdk) (4.12.2)\n",
"Requirement already satisfied: deprecated>=1.2.6 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-api==1.29.0->opentelemetry-sdk) (1.2.15)\n",
"Requirement already satisfied: importlib-metadata<=8.5.0,>=6.0 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-api==1.29.0->opentelemetry-sdk) (8.5.0)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.3.1)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (4.55.3)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.4.8)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (3.2.1)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests) (3.4.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests) (3.10)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests) (2024.12.14)\n",
"Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb-client)\n",
" Downloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3-none-any.whl.metadata (2.2 kB)\n",
"Collecting overrides>=7.3.1 (from chromadb-client)\n",
" Downloading overrides-7.7.0-py3-none-any.whl.metadata (5.8 kB)\n",
"Collecting posthog>=2.4.0 (from chromadb-client)\n",
" Downloading posthog-3.8.4-py2.py3-none-any.whl.metadata (2.8 kB)\n",
"Requirement already satisfied: tenacity>=8.2.3 in /usr/local/lib/python3.11/dist-packages (from chromadb-client) (9.0.0)\n",
"Requirement already satisfied: orjson>=3.9.12 in /usr/local/lib/python3.11/dist-packages (from chromadb-client) (3.10.14)\n",
"Collecting anyio>=4.5 (from mcp)\n",
" Downloading anyio-4.8.0-py3-none-any.whl.metadata (4.6 kB)\n",
"Collecting httpx-sse>=0.4 (from mcp)\n",
" Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)\n",
"Collecting pydantic-settings>=2.6.1 (from mcp)\n",
" Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)\n",
"Collecting sse-starlette>=1.6.1 (from mcp)\n",
" Downloading sse_starlette-2.2.1-py3-none-any.whl.metadata (7.8 kB)\n",
"Collecting starlette>=0.27 (from mcp)\n",
" Downloading starlette-0.45.2-py3-none-any.whl.metadata (6.3 kB)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk) (1.4.2)\n",
"Requirement already satisfied: googleapis-common-protos~=1.52 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-exporter-otlp-proto-http) (1.66.0)\n",
"Collecting opentelemetry-exporter-otlp-proto-common==1.29.0 (from opentelemetry-exporter-otlp-proto-http)\n",
" Downloading opentelemetry_exporter_otlp_proto_common-1.29.0-py3-none-any.whl.metadata (1.8 kB)\n",
"Collecting opentelemetry-proto==1.29.0 (from opentelemetry-exporter-otlp-proto-http)\n",
" Downloading opentelemetry_proto-1.29.0-py3-none-any.whl.metadata (2.3 kB)\n",
"Collecting protobuf<6.0,>=5.0 (from opentelemetry-proto==1.29.0->opentelemetry-exporter-otlp-proto-http)\n",
" Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)\n",
"Collecting chevron (from autoevals)\n",
" Downloading chevron-0.14.0-py3-none-any.whl.metadata (4.9 kB)\n",
"Collecting levenshtein (from autoevals)\n",
" Downloading levenshtein-0.26.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)\n",
"Collecting braintrust_core==0.0.58 (from autoevals)\n",
" Downloading braintrust_core-0.0.58-py3-none-any.whl.metadata (669 bytes)\n",
"Requirement already satisfied: jsonschema in /usr/local/lib/python3.11/dist-packages (from autoevals) (4.23.0)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (3.5.0)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from openai) (1.9.0)\n",
"Requirement already satisfied: jiter<1,>=0.4.0 in /usr/local/lib/python3.11/dist-packages (from openai) (0.8.2)\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from openai) (1.3.1)\n",
"Collecting starlette>=0.27 (from mcp)\n",
" Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.11/dist-packages (from fire) (2.5.0)\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx) (1.0.7)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx) (0.14.0)\n",
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (2.4.4)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (1.3.2)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (24.3.0)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (1.5.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (6.1.0)\n",
"Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (0.2.1)\n",
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (1.18.3)\n",
"Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.11/dist-packages (from deprecated>=1.2.6->opentelemetry-api==1.29.0->opentelemetry-sdk) (1.17.0)\n",
"Requirement already satisfied: grpcio<2.0.0,>=1.63.2 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb-client) (1.69.0)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from posthog>=2.4.0->chromadb-client) (1.17.0)\n",
"Collecting monotonic>=1.5 (from posthog>=2.4.0->chromadb-client)\n",
" Downloading monotonic-1.6-py2.py3-none-any.whl.metadata (1.5 kB)\n",
"Collecting backoff>=1.10.0 (from posthog>=2.4.0->chromadb-client)\n",
" Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<3.0.0,>=2.6.3->together) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.27.2 in /usr/local/lib/python3.11/dist-packages (from pydantic<3.0.0,>=2.6.3->together) (2.27.2)\n",
"Requirement already satisfied: python-dotenv>=0.21.0 in /usr/local/lib/python3.11/dist-packages (from pydantic-settings>=2.6.1->mcp) (1.0.1)\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich<14.0.0,>=13.8.1->together) (3.0.0)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich<14.0.0,>=13.8.1->together) (2.18.0)\n",
"Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.11/dist-packages (from typer<0.16,>=0.9->together) (1.5.4)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.11/dist-packages (from jsonschema->autoevals) (2024.10.1)\n",
"Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.11/dist-packages (from jsonschema->autoevals) (0.35.1)\n",
"Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.11/dist-packages (from jsonschema->autoevals) (0.22.3)\n",
"Collecting rapidfuzz<4.0.0,>=3.9.0 (from levenshtein->autoevals)\n",
" Downloading rapidfuzz-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n",
"Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.11/dist-packages (from importlib-metadata<=8.5.0,>=6.0->opentelemetry-api==1.29.0->opentelemetry-sdk) (3.21.0)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich<14.0.0,>=13.8.1->together) (0.1.2)\n",
"Downloading together-1.3.11-py3-none-any.whl (70 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m70.6/70.6 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading datasets-3.2.0-py3-none-any.whl (480 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading redis-5.2.1-py3-none-any.whl (261 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.5/261.5 kB\u001b[0m \u001b[31m25.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading chromadb_client-0.6.3-py3-none-any.whl (609 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m609.2/609.2 kB\u001b[0m \u001b[31m38.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m100.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading mcp-1.2.0-py3-none-any.whl (66 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.5/66.5 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl (4.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m106.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m27.5/27.5 MB\u001b[0m \u001b[31m78.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading opentelemetry_exporter_otlp_proto_http-1.29.0-py3-none-any.whl (17 kB)\n",
"Downloading opentelemetry_exporter_otlp_proto_common-1.29.0-py3-none-any.whl (18 kB)\n",
"Downloading opentelemetry_proto-1.29.0-py3-none-any.whl (55 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.8/55.8 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading autoevals-0.0.117-py3-none-any.whl (41 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.4/41.4 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading braintrust_core-0.0.58-py3-none-any.whl (4.4 kB)\n",
"Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.0/298.0 kB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading aiosqlite-0.20.0-py3-none-any.whl (15 kB)\n",
"Downloading fastapi-0.115.6-py3-none-any.whl (94 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.8/94.8 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading uvicorn-0.34.0-py3-none-any.whl (62 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.3/62.3 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading anyio-4.8.0-py3-none-any.whl (96 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m96.0/96.0 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m12.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m17.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)\n",
"Downloading multiprocess-0.70.16-py311-none-any.whl (143 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.5/143.5 kB\u001b[0m \u001b[31m14.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3-none-any.whl (18 kB)\n",
"Downloading overrides-7.7.0-py3-none-any.whl (17 kB)\n",
"Downloading posthog-3.8.4-py2.py3-none-any.whl (69 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.8/69.8 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pydantic_settings-2.7.1-py3-none-any.whl (29 kB)\n",
"Downloading sse_starlette-2.2.1-py3-none-any.whl (10 kB)\n",
"Downloading starlette-0.41.3-py3-none-any.whl (73 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.2/73.2 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading chevron-0.14.0-py3-none-any.whl (11 kB)\n",
"Downloading levenshtein-0.26.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m162.7/162.7 kB\u001b[0m \u001b[31m17.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.8/194.8 kB\u001b[0m \u001b[31m21.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading backoff-2.2.1-py3-none-any.whl (15 kB)\n",
"Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n",
"Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl (319 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m319.7/319.7 kB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading rapidfuzz-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m84.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: monotonic, chevron, xxhash, uvicorn, redis, rapidfuzz, pypdf, psycopg2-binary, protobuf, pillow, overrides, httpx-sse, fsspec, faiss-cpu, dill, braintrust_core, backoff, anyio, aiosqlite, starlette, posthog, opentelemetry-proto, multiprocess, levenshtein, sse-starlette, pydantic-settings, opentelemetry-exporter-otlp-proto-common, fastapi, together, mcp, datasets, autoevals, opentelemetry-exporter-otlp-proto-http, opentelemetry-exporter-otlp-proto-grpc, chromadb-client\n",
" Attempting uninstall: protobuf\n",
" Found existing installation: protobuf 4.25.5\n",
" Uninstalling protobuf-4.25.5:\n",
" Successfully uninstalled protobuf-4.25.5\n",
" Attempting uninstall: pillow\n",
" Found existing installation: pillow 11.1.0\n",
" Uninstalling pillow-11.1.0:\n",
" Successfully uninstalled pillow-11.1.0\n",
" Attempting uninstall: fsspec\n",
" Found existing installation: fsspec 2024.10.0\n",
" Uninstalling fsspec-2024.10.0:\n",
" Successfully uninstalled fsspec-2024.10.0\n",
" Attempting uninstall: anyio\n",
" Found existing installation: anyio 3.7.1\n",
" Uninstalling anyio-3.7.1:\n",
" Successfully uninstalled anyio-3.7.1\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"jupyter-server 1.24.0 requires anyio<4,>=3.1.0, but you have anyio 4.8.0 which is incompatible.\n",
"gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\n",
"tensorflow 2.17.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.3 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0mSuccessfully installed aiosqlite-0.20.0 anyio-4.8.0 autoevals-0.0.117 backoff-2.2.1 braintrust_core-0.0.58 chevron-0.14.0 chromadb-client-0.6.3 datasets-3.2.0 dill-0.3.8 faiss-cpu-1.9.0.post1 fastapi-0.115.6 fsspec-2024.9.0 httpx-sse-0.4.0 levenshtein-0.26.1 mcp-1.2.0 monotonic-1.6 multiprocess-0.70.16 opentelemetry-exporter-otlp-proto-common-1.29.0 opentelemetry-exporter-otlp-proto-grpc-1.29.0 opentelemetry-exporter-otlp-proto-http-1.29.0 opentelemetry-proto-1.29.0 overrides-7.7.0 pillow-10.4.0 posthog-3.8.4 protobuf-5.29.3 psycopg2-binary-2.9.10 pydantic-settings-2.7.1 pypdf-5.1.0 rapidfuzz-3.11.0 redis-5.2.1 sse-starlette-2.2.1 starlette-0.41.3 together-1.3.11 uvicorn-0.34.0 xxhash-3.5.0\n",
"torch --index-url https://download.pytorch.org/whl/cpu\n",
"Looking in indexes: https://download.pytorch.org/whl/cpu\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.11/dist-packages (2.5.1+cu121)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from torch) (3.16.1)\n",
"Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.11/dist-packages (from torch) (4.12.2)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch) (3.4.2)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.5)\n",
"Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch) (2024.9.0)\n",
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch) (9.1.0.70)\n",
"Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.3.1)\n",
"Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.11/dist-packages (from torch) (11.0.2.54)\n",
"Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.11/dist-packages (from torch) (10.3.2.106)\n",
"Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.11/dist-packages (from torch) (11.4.5.107)\n",
"Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.0.106)\n",
"Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch) (2.21.5)\n",
"Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: triton==3.1.0 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.0)\n",
"Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch) (1.13.1)\n",
"Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.11/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch) (12.6.85)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch) (1.3.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch) (3.0.2)\n",
"sentence-transformers --no-deps\n",
"Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.11/dist-packages (3.3.1)\n",
"\u001b[32mBuild Successful!\u001b[0m\n"
]
}
],
"outputs": [],
"source": [
"# NBVAL_SKIP\n",
"\n",
"# This will build all the dependencies you will need\n",
"!llama stack build --template together --image-type venv"
]
@ -571,7 +145,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"id": "E1UFuJC570Tk",
"metadata": {
"colab": {
@ -1125,11 +699,8 @@
" if not api_key:\n",
" raise ValueError(f\"{key} environment variable is empty\")\n",
" except KeyError:\n",
" raise KeyError(\n",
" f\"{key} environment variable is not set. \"\n",
" \"Please set your API key using in userdata (if using google colab notebook)\"\n",
" f\"or using `export {key}='your-api-key-here'`\"\n",
" ) from None\n",
" api_key = input(f\"{key} environment variable is not set. Please enter your API key: \")\n",
" os.environ[key] = api_key\n",
"\n",
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
"client = LlamaStackAsLibraryClient(\"together\", provider_data = {\"tavily_search_api_key\": os.environ['TAVILY_SEARCH_API_KEY']})\n",
@ -1150,7 +721,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"id": "ruO9jQna_t_S",
"metadata": {
"colab": {
@ -1211,7 +782,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"id": "LINBvv8lwTJh",
"metadata": {
"colab": {
@ -1228,7 +799,7 @@
"'meta-llama/Llama-3.1-70B-Instruct'"
]
},
"execution_count": 4,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@ -1253,7 +824,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"id": "77c29dba",
"metadata": {
"colab": {
@ -1267,7 +838,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Here's a two-sentence poem about a llama:\n",
"Here is a two-sentence poem about a llama:\n",
"\n",
"With gentle eyes and a soft, fuzzy face,\n",
"The llama roams, a peaceful, gentle pace.\n"
@ -2084,13 +1655,14 @@
}
],
"source": [
"import uuid\n",
"from llama_stack_client.lib.agents.agent import Agent\n",
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
"from termcolor import cprint\n",
"from llama_stack_client.types import Document\n",
"\n",
"urls = [\"chat.rst\", \"llama3.rst\", \"datasets.rst\", \"lora_finetune.rst\"]\n",
"urls = [\"chat.rst\", \"llama3.rst\", \"memory_optimizations.rst\", \"lora_finetune.rst\"]\n",
"documents = [\n",
" Document(\n",
" document_id=f\"num-{i}\",\n",
@ -2101,7 +1673,7 @@
" for i, url in enumerate(urls)\n",
"]\n",
"\n",
"vector_db_id = \"test-vector-db\"\n",
"vector_db_id = f\"test-vector-db-{uuid.uuid4().hex}\"\n",
"client.vector_dbs.register(\n",
" vector_db_id=vector_db_id,\n",
" embedding_model=\"all-MiniLM-L6-v2\",\n",
@ -2398,6 +1970,7 @@
}
],
"source": [
"# NBVAL_SKIP\n",
"!pip install colab-xterm #https://pypi.org/project/colab-xterm/\n",
"%load_ext colabxterm"
]
@ -2774,7 +2347,7 @@
}
],
"source": [
"\n",
"# NBVAL_SKIP\n",
"%xterm\n",
"# touch /content/foo\n",
"# touch /content/bar\n",
@ -2800,6 +2373,7 @@
},
"outputs": [],
"source": [
"# NBVAL_SKIP\n",
"from llama_stack_client.types.shared_params.url import URL\n",
"client.toolgroups.register(\n",
" toolgroup_id=\"mcp::filesystem\",\n",
@ -3170,6 +2744,7 @@
}
],
"source": [
"# NBVAL_SKIP\n",
"from llama_stack_client.lib.agents.agent import Agent\n",
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
@ -3523,7 +3098,7 @@
}
],
"source": [
"# NBVAL_SKIP \n",
"# NBVAL_SKIP\n",
"print(f\"Getting traces for session_id={session_id}\")\n",
"import json\n",
"\n",
@ -3821,6 +3396,231 @@
"response = client.scoring.score(input_rows=rows, scoring_functions=scoring_params)\n",
"pprint(response)\n"
]
},
{
"cell_type": "markdown",
"id": "ad077440",
"metadata": {},
"source": [
"## 4. Image Understanding with Llama 3.2\n",
"\n",
"Below is a complete example of using Together's Llama Stack 0.1 server at https://llama-stack.together.ai to ask Llama 3.2 questions about an image."
]
},
{
"cell_type": "markdown",
"id": "82e381ec",
"metadata": {},
"source": [
"### 4.1 Setup and helpers\n",
"\n",
"Below we install the Llama Stack client 0.1, download the example image, define two image helpers, and set Llama Stack Together server URL and Llama 3.2 model name.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "865fc5a8",
"metadata": {},
"outputs": [],
"source": [
"!pip install llama-stack-client==0.1.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44e05e16",
"metadata": {},
"outputs": [],
"source": [
"!wget https://raw.githubusercontent.com/meta-llama/llama-models/refs/heads/main/Llama_Repo.jpeg"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "469750f7",
"metadata": {},
"outputs": [],
"source": [
"from PIL import Image\n",
"import matplotlib.pyplot as plt\n",
"\n",
"def display_image(path):\n",
" img = Image.open(path)\n",
" plt.imshow(img)\n",
" plt.axis('off')\n",
" plt.show()\n",
"\n",
"display_image(\"Llama_Repo.jpeg\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2c1e1c2",
"metadata": {},
"outputs": [],
"source": [
"import base64\n",
"\n",
"def encode_image(image_path):\n",
" with open(image_path, \"rb\") as image_file:\n",
" base64_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n",
" base64_url = f\"data:image/png;base64,{base64_string}\"\n",
" return base64_url"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c565f99e",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client import LlamaStackClient\n",
"\n",
"LLAMA_STACK_API_TOGETHER_URL=\"https://llama-stack.together.ai\"\n",
"LLAMA32_11B_INSTRUCT = \"meta-llama/Llama-3.2-11B-Vision-Instruct\""
]
},
{
"cell_type": "markdown",
"id": "7737cd41",
"metadata": {},
"source": [
"### 4.2 Using Llama Stack Chat API\n",
"\n",
"The code below uses the Llama Stack 0.1's chat API to interact with Llama 3.2:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7914894",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client.lib.inference.event_logger import EventLogger\n",
"\n",
"async def run_main(image_path: str, prompt):\n",
" client = LlamaStackClient(\n",
" base_url=LLAMA_STACK_API_TOGETHER_URL,\n",
" )\n",
"\n",
" message = {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"image\": {\n",
" \"url\": {\n",
" \"uri\": encode_image(image_path)\n",
" }\n",
" }\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": prompt,\n",
" }\n",
" ]\n",
" }\n",
"\n",
" response = client.inference.chat_completion(\n",
" messages=[message],\n",
" model_id=LLAMA32_11B_INSTRUCT,\n",
" stream=False,\n",
" )\n",
"\n",
" print(response.completion_message.content.lower().strip())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ee09b97",
"metadata": {},
"outputs": [],
"source": [
"await run_main(\"Llama_Repo.jpeg\",\n",
" \"How many different colors are those llamas?\\\n",
" What are those colors?\")"
]
},
{
"cell_type": "markdown",
"id": "e741d7b9",
"metadata": {},
"source": [
"### 4.3 Using Llama Stack Agent API\n",
"\n",
"The code below uses the Llama Stack 0.1's Agent API to interact with Llama 3.2:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9a83275",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client.lib.agents.agent import Agent\n",
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
"\n",
"async def run_main(image_path, prompt):\n",
" base64_image = encode_image(image_path)\n",
"\n",
" client = LlamaStackClient(\n",
" base_url=LLAMA_STACK_API_TOGETHER_URL,\n",
" )\n",
"\n",
" agent_config = AgentConfig(\n",
" model=LLAMA32_11B_INSTRUCT,\n",
" instructions=\"You are a helpful assistant\",\n",
" enable_session_persistence=False,\n",
" )\n",
"\n",
" agent = Agent(client, agent_config)\n",
" session_id = agent.create_session(\"test-session\")\n",
"\n",
" response = agent.create_turn(\n",
" messages=[{\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"image\": {\n",
" \"url\": {\n",
" \"uri\": encode_image(image_path)\n",
" }\n",
" }\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": prompt,\n",
" }\n",
" ]\n",
" }],\n",
" session_id=session_id,\n",
" )\n",
"\n",
" for log in EventLogger().log(response):\n",
" log.print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15d0098b",
"metadata": {},
"outputs": [],
"source": [
"await run_main(\"Llama_Repo.jpeg\",\n",
" \"How many different colors are those llamas?\\\n",
" What are those colors?\")"
]
}
],
"metadata": {
@ -3830,7 +3630,8 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"display_name": "toolchain",
"language": "python",
"name": "python3"
},
"language_info": {

View file

@ -6,7 +6,7 @@
"id": "hTIfyoGtjoWD"
},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1UvR9m2KTinvlDXeOWfS2HBU4X72LAjTz?usp=sharing)\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb)\n",
"\n",
"# Llama Stack Benchmark Evals\n",
"\n",
@ -1383,7 +1383,8 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"display_name": "master",
"language": "python",
"name": "python3"
},
"language_info": {

View file

@ -36,6 +36,16 @@ from .pyopenapi.specification import Info, Server # noqa: E402
from .pyopenapi.utility import Specification # noqa: E402
def str_presenter(dumper, data):
if data.startswith(f"/{LLAMA_STACK_API_VERSION}") or data.startswith(
"#/components/schemas/"
):
style = None
else:
style = ">" if "\n" in data or len(data) > 40 else None
return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)
def main(output_dir: str):
output_dir = Path(output_dir)
if not output_dir.exists():
@ -69,7 +79,8 @@ def main(output_dir: str):
y.sequence_dash_offset = 2
y.width = 80
y.allow_unicode = True
y.explicit_start = True
y.representer.add_representer(str, str_presenter)
y.dump(
spec.get_json(),
fp,
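A minimal standalone sketch of what the presenter above does to the dumped YAML: route strings and schema refs stay plain scalars, while other long or multi-line strings get the folded ">" style. It assumes ruamel.yaml and uses "v1" as a stand-in for LLAMA_STACK_API_VERSION.

import io

from ruamel.yaml import YAML

API_VERSION = "v1"  # stand-in for LLAMA_STACK_API_VERSION

def str_presenter(dumper, data):
    # keep routes and schema refs unfolded; fold other long or multi-line strings
    if data.startswith(f"/{API_VERSION}") or data.startswith("#/components/schemas/"):
        style = None
    else:
        style = ">" if "\n" in data or len(data) > 40 else None
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)

y = YAML()
y.width = 80
y.representer.add_representer(str, str_presenter)

buf = io.StringIO()
y.dump(
    {
        "$ref": "#/components/schemas/ChatCompletionRequest",
        "description": "A long description that exceeds forty characters and therefore folds.",
    },
    buf,
)
print(buf.getvalue())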

View file

@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import collections
import hashlib
import ipaddress
import typing
from dataclasses import make_dataclass
from typing import Any, Dict, Set, Union
from ..strong_typing.core import JsonType
@ -177,20 +177,37 @@ class ContentBuilder:
) -> Dict[str, MediaType]:
"Creates the content subtree for a request or response."
def has_iterator_type(t):
if typing.get_origin(t) is typing.Union:
return any(has_iterator_type(a) for a in typing.get_args(t))
def is_iterator_type(t):
return "StreamChunk" in str(t)
def get_media_type(t):
if is_generic_list(t):
return "application/jsonl"
elif is_iterator_type(t):
return "text/event-stream"
else:
# TODO: needs a proper fix where we let all types correctly flow upwards
# and then test against AsyncIterator
return "StreamChunk" in str(t)
return "application/json"
if typing.get_origin(payload_type) is typing.Union:
media_types = []
item_types = []
for x in typing.get_args(payload_type):
media_types.append(get_media_type(x))
item_types.append(x)
if len(set(media_types)) == 1:
# all types have the same media type
return {media_types[0]: self.build_media_type(payload_type, examples)}
else:
# different types have different media types
return {
media_type: self.build_media_type(item_type, examples)
for media_type, item_type in zip(media_types, item_types)
}
if is_generic_list(payload_type):
media_type = "application/jsonl"
item_type = unwrap_generic_list(payload_type)
elif has_iterator_type(payload_type):
item_type = payload_type
media_type = "text/event-stream"
else:
media_type = "application/json"
item_type = payload_type
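A rough, self-contained illustration of the media-type selection above; the string checks below are simplified stand-ins for is_generic_list / is_iterator_type, and the chunk class name is invented.

from typing import AsyncIterator, List

class ChatCompletionResponseStreamChunk:
    # invented stand-in for a streaming chunk type
    pass

def get_media_type(t) -> str:
    name = str(t)
    if name.startswith("typing.List["):
        return "application/jsonl"    # list payloads are emitted as JSON Lines
    if "StreamChunk" in name:
        return "text/event-stream"    # streaming chunk types are served over SSE
    return "application/json"         # everything else is a plain JSON body

print(get_media_type(List[int]))                                         # application/jsonl
print(get_media_type(AsyncIterator[ChatCompletionResponseStreamChunk]))  # text/event-stream
print(get_media_type(dict))                                              # application/json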
@ -233,7 +250,9 @@ class ContentBuilder:
value = sample_transformer(object_to_json(example))
hash_string = (
hashlib.md5(json_dump_string(value).encode("utf-8")).digest().hex()
hashlib.sha256(json_dump_string(value).encode("utf-8"))
.digest()
.hex()[:16]
)
name = f"ex-{hash_string}"
@ -276,6 +295,20 @@ class StatusResponse:
examples: List[Any] = dataclasses.field(default_factory=list)
def create_docstring_for_request(
request_name: str, fields: List[Tuple[str, type, Any]], doc_params: Dict[str, str]
) -> str:
"""Creates a ReST-style docstring for a dynamically generated request dataclass."""
lines = ["\n"] # Short description
# Add parameter documentation in ReST format
for name, type_ in fields:
desc = doc_params.get(name, "")
lines.append(f":param {name}: {desc}")
return "\n".join(lines)
class ResponseBuilder:
content_builder: ContentBuilder
@ -493,11 +526,24 @@ class Generator:
first = next(iter(op.request_params))
request_name, request_type = first
from dataclasses import make_dataclass
op_name = "".join(word.capitalize() for word in op.name.split("_"))
request_name = f"{op_name}Request"
request_type = make_dataclass(request_name, op.request_params)
fields = [
(
name,
type_,
)
for name, type_ in op.request_params
]
request_type = make_dataclass(
request_name,
fields,
namespace={
"__doc__": create_docstring_for_request(
request_name, fields, doc_params
)
},
)
requestBody = RequestBody(
content={
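The dynamic request-type construction above can be reduced to a small self-contained sketch; the field names and descriptions here are invented for illustration.

from dataclasses import make_dataclass

def create_docstring_for_request(request_name, fields, doc_params):
    # same idea as the helper above: one ":param" line per request field
    lines = ["\n"]
    for name, _type in fields:
        lines.append(f":param {name}: {doc_params.get(name, '')}")
    return "\n".join(lines)

fields = [("model_id", str), ("stream", bool)]
doc_params = {
    "model_id": "identifier of the model to query",
    "stream": "whether to stream partial results",
}

ChatCompletionRequest = make_dataclass(
    "ChatCompletionRequest",
    fields,
    namespace={
        "__doc__": create_docstring_for_request("ChatCompletionRequest", fields, doc_params)
    },
)

print(ChatCompletionRequest.__doc__)
# :param model_id: identifier of the model to query
# :param stream: whether to stream partial results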
@ -598,10 +644,14 @@ class Generator:
else:
callbacks = None
description = "\n".join(
filter(None, [doc_string.short_description, doc_string.long_description])
)
return Operation(
tags=[op.defining_class.__name__],
summary=doc_string.short_description,
description=doc_string.long_description,
summary=None,
# summary=doc_string.short_description,
description=description,
parameters=parameters,
requestBody=requestBody,
responses=responses,
@ -633,6 +683,7 @@ class Generator:
raise NotImplementedError(f"unknown HTTP method: {op.http_method}")
route = op.get_route()
route = route.replace(":path", "")
print(f"route: {route}")
if route in paths:
paths[route].update(pathItem)
@ -650,12 +701,6 @@ class Generator:
)
)
# types that are produced/consumed by operations
type_tags = [
self._build_type_tag(ref, schema)
for ref, schema in self.schema_builder.schemas.items()
]
# types that are emitted by events
event_tags: List[Tag] = []
events = get_endpoint_events(self.endpoint)
@ -682,7 +727,6 @@ class Generator:
# list all operations and types
tags: List[Tag] = []
tags.extend(operation_tags)
tags.extend(type_tags)
tags.extend(event_tags)
for extra_tag_group in extra_tag_groups.values():
tags.extend(extra_tag_group)
@ -697,13 +741,6 @@ class Generator:
tags=sorted(tag.name for tag in operation_tags),
)
)
if type_tags:
tag_groups.append(
TagGroup(
name=self.options.map("Types"),
tags=sorted(tag.name for tag in type_tags),
)
)
if event_tags:
tag_groups.append(
TagGroup(

View file

@ -130,6 +130,8 @@ class _FormatParameterExtractor:
def _get_route_parameters(route: str) -> List[str]:
extractor = _FormatParameterExtractor()
# Replace all occurrences of ":path" with empty string
route = route.replace(":path", "")
route.format_map(extractor)
return extractor.keys

View file

@ -6,36 +6,36 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>OpenAPI specification</title>
<link href="https://fonts.googleapis.com/css?family=Montserrat:300,400,700|Roboto:300,400,700" rel="stylesheet">
<script type="module" src="https://unpkg.com/@stoplight/elements/web-components.min.js"></script>
<link rel="stylesheet" href="https://unpkg.com/@stoplight/elements/styles.min.css">
<style>
body {
margin: 0;
padding: 0;
height: 100vh;
}
elements-api {
height: 100%;
}
</style>
<script defer="defer" src="https://cdn.redoc.ly/redoc/latest/bundles/redoc.standalone.js"></script>
<script defer="defer">
</head>
<body>
<elements-api id="openapi-container" router="hash" layout="sidebar" hideExport="true"
hideInternal="true"></elements-api>
<script>
document.addEventListener("DOMContentLoaded", function () {
spec = { /* OPENAPI_SPECIFICATION */ };
options = {
downloadFileName: "openapi.json",
expandResponses: "200",
expandSingleSchemaField: true,
jsonSampleExpandLevel: "all",
schemaExpansionLevel: "all",
};
element = document.getElementById("openapi-container");
Redoc.init(spec, options, element);
const spec = { /* OPENAPI_SPECIFICATION */ };
const element = document.getElementById("openapi-container");
element.apiDescriptionDocument = spec;
if (spec.info && spec.info.title) {
document.title = spec.info.title;
}
});
</script>
</head>
<body>
<div id="openapi-container"></div>
</body>
</html>

View file

@ -29,4 +29,5 @@ fi
stack_dir=$(dirname $(dirname $THIS_DIR))
models_dir=$(dirname $stack_dir)/llama-models
PYTHONPATH=$PYTHONPATH:$stack_dir:$models_dir python -m docs.openapi_generator.generate $(dirname $THIS_DIR)/resources
PYTHONPATH=$PYTHONPATH:$stack_dir:$models_dir \
python -m docs.openapi_generator.generate $(dirname $THIS_DIR)/_static

View file

@ -109,10 +109,10 @@ def get_class_property_docstrings(
def docstring_to_schema(data_type: type) -> Schema:
short_description, long_description = get_class_docstrings(data_type)
schema: Schema = {}
if short_description:
schema["title"] = short_description
if long_description:
schema["description"] = long_description
description = "\n".join(filter(None, [short_description, long_description]))
if description:
schema["description"] = description
return schema
@ -248,7 +248,9 @@ class JsonSchemaGenerator:
type_schema.update(self._metadata_to_schema(m))
return type_schema
def _simple_type_to_schema(self, typ: TypeLike) -> Optional[Schema]:
def _simple_type_to_schema(
self, typ: TypeLike, json_schema_extra: Optional[dict] = None
) -> Optional[Schema]:
"""
Returns the JSON schema associated with a simple, unrestricted type.
@ -264,6 +266,11 @@ class JsonSchemaGenerator:
elif typ is float:
return {"type": "number"}
elif typ is str:
if json_schema_extra and "contentEncoding" in json_schema_extra:
return {
"type": "string",
"contentEncoding": json_schema_extra["contentEncoding"],
}
return {"type": "string"}
elif typ is bytes:
return {"type": "string", "contentEncoding": "base64"}
@ -303,7 +310,12 @@ class JsonSchemaGenerator:
# not a simple type
return None
def type_to_schema(self, data_type: TypeLike, force_expand: bool = False) -> Schema:
def type_to_schema(
self,
data_type: TypeLike,
force_expand: bool = False,
json_schema_extra: Optional[dict] = None,
) -> Schema:
"""
Returns the JSON schema associated with a type.
@ -313,7 +325,7 @@ class JsonSchemaGenerator:
"""
# short-circuit for common simple types
schema = self._simple_type_to_schema(data_type)
schema = self._simple_type_to_schema(data_type, json_schema_extra)
if schema is not None:
return schema
@ -486,15 +498,9 @@ class JsonSchemaGenerator:
property_docstrings = get_class_property_docstrings(
typ, self.options.property_description_fun
)
properties: Dict[str, Schema] = {}
required: List[str] = []
for property_name, property_type in get_class_properties(typ):
defaults = {}
if "model_fields" in members:
f = members["model_fields"]
defaults = {k: finfo.default for k, finfo in f.items()}
# rename property if an alias name is specified
alias = get_annotation(property_type, Alias)
if alias:
@ -502,11 +508,22 @@ class JsonSchemaGenerator:
else:
output_name = property_name
defaults = {}
json_schema_extra = None
if "model_fields" in members:
f = members["model_fields"]
defaults = {k: finfo.default for k, finfo in f.items()}
json_schema_extra = f.get(output_name, None).json_schema_extra
if is_type_optional(property_type):
optional_type: type = unwrap_optional_type(property_type)
property_def = self.type_to_schema(optional_type)
property_def = self.type_to_schema(
optional_type, json_schema_extra=json_schema_extra
)
else:
property_def = self.type_to_schema(property_type)
property_def = self.type_to_schema(
property_type, json_schema_extra=json_schema_extra
)
required.append(output_name)
# check if attribute has a default value initializer
@ -531,6 +548,7 @@ class JsonSchemaGenerator:
# add property docstring if available
property_doc = property_docstrings.get(property_name)
if property_doc:
# print(output_name, property_doc)
property_def.pop("title", None)
property_def["description"] = property_doc

View file

@ -6,6 +6,6 @@ Here's a collection of comprehensive guides, examples, and resources for buildin
Try out Llama Stack's capabilities through our detailed Jupyter notebooks:
* [Building AI Applications Notebook](./notebooks/Llama_Stack_Building_AI_Applications.ipynb) - A comprehensive guide to building production-ready AI applications using Llama Stack
* [Building AI Applications Notebook](./getting_started.ipynb) - A comprehensive guide to building production-ready AI applications using Llama Stack
* [Benchmark Evaluations Notebook](./notebooks/Llama_Stack_Benchmark_Evals.ipynb) - Detailed performance evaluations and benchmarking results
* [Zero-to-Hero Guide](./notebooks/Llama_Stack_Zero_to_Hero_Guide.ipynb) - Step-by-step guide for getting started with Llama Stack
* [Zero-to-Hero Guide](./zero_to_hero_guide) - Step-by-step guide for getting started with Llama Stack

View file

@ -77,7 +77,7 @@ agent_config = AgentConfig(
instructions="You are a helpful assistant",
# Enable both RAG and tool usage
toolgroups=[
{"name": "builtin::rag", "args": {"vector_db_ids": ["my_docs"]}}.
{"name": "builtin::rag", "args": {"vector_db_ids": ["my_docs"]}},
"builtin::code_interpreter",
],
# Configure safety
@ -86,13 +86,9 @@ agent_config = AgentConfig(
# Control the inference loop
max_infer_iters=5,
sampling_params={
"strategy": {
"type": "top_p",
"temperature": 0.7,
"top_p": 0.95
},
"max_tokens": 2048
}
"strategy": {"type": "top_p", "temperature": 0.7, "top_p": 0.95},
"max_tokens": 2048,
},
)
agent = Agent(client, agent_config)
@ -101,11 +97,13 @@ session_id = agent.create_session("monitored_session")
# Stream the agent's execution steps
response = agent.create_turn(
messages=[{"role": "user", "content": "Analyze this code and run it"}],
attachments=[{
"content": "https://raw.githubusercontent.com/example/code.py",
"mime_type": "text/plain"
}],
session_id=session_id
attachments=[
{
"content": "https://raw.githubusercontent.com/example/code.py",
"mime_type": "text/plain",
}
],
session_id=session_id,
)
# Monitor each step of execution
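# One way to consume the stream (a sketch): the EventLogger helper from the
# llama-stack client pretty-prints each step event as it arrives.
from llama_stack_client.lib.agents.event_logger import EventLogger

for log in EventLogger().log(response):
    log.print()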

View file

@ -15,6 +15,7 @@ This first example walks you through how to evaluate a model candidate served by
```python
import datasets
ds = datasets.load_dataset(path="llamastack/mmmu", name="Agriculture", split="dev")
ds = ds.select_columns(["chat_completion_input", "input_query", "expected_answer"])
eval_rows = ds.to_pandas().to_dict(orient="records")
@ -43,7 +44,7 @@ system_message = {
client.eval_tasks.register(
eval_task_id="meta-reference::mmmu",
dataset_id=f"mmmu-{subset}-{split}",
scoring_functions=["basic::regex_parser_multiple_choice_answer"]
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
response = client.eval.evaluate_rows(
@ -62,9 +63,9 @@ response = client.eval.evaluate_rows(
"max_tokens": 4096,
"repeat_penalty": 1.0,
},
"system_message": system_message
}
}
"system_message": system_message,
},
},
)
```
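Once `evaluate_rows` returns, you can inspect the generations and the per-scoring-function results. A minimal sketch, assuming the response exposes `generations` and a `scores` mapping (field names may differ slightly between client versions):

```python
# Sketch: inspect the evaluation output.
print(f"Generated {len(response.generations)} answers")
for scoring_fn, scoring_result in response.scores.items():
    # `aggregated_results` is assumed to hold the summary metrics.
    print(scoring_fn, scoring_result.aggregated_results)
```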
@ -88,7 +89,7 @@ _ = client.datasets.register(
"input_query": {"type": "string"},
"expected_answer": {"type": "string"},
"chat_completion_input": {"type": "chat_completion_input"},
}
},
)
eval_rows = client.datasetio.get_rows_paginated(
@ -101,7 +102,7 @@ eval_rows = client.datasetio.get_rows_paginated(
client.eval_tasks.register(
eval_task_id="meta-reference::simpleqa",
dataset_id=simpleqa_dataset_id,
scoring_functions=["llm-as-judge::405b-simpleqa"]
scoring_functions=["llm-as-judge::405b-simpleqa"],
)
response = client.eval.evaluate_rows(
@ -120,8 +121,8 @@ response = client.eval.evaluate_rows(
"max_tokens": 4096,
"repeat_penalty": 1.0,
},
}
}
},
},
)
```
@ -144,14 +145,14 @@ agent_config = {
{
"type": "brave_search",
"engine": "tavily",
"api_key": userdata.get("TAVILY_SEARCH_API_KEY")
"api_key": userdata.get("TAVILY_SEARCH_API_KEY"),
}
],
"tool_choice": "auto",
"tool_prompt_format": "json",
"input_shields": [],
"output_shields": [],
"enable_session_persistence": False
"enable_session_persistence": False,
}
response = client.eval.evaluate_rows(
@ -163,7 +164,7 @@ response = client.eval.evaluate_rows(
"eval_candidate": {
"type": "agent",
"config": agent_config,
}
}
},
},
)
```

View file

@ -13,7 +13,7 @@ Here's how to set up basic evaluation:
response = client.eval_tasks.register(
eval_task_id="my_eval",
dataset_id="my_dataset",
scoring_functions=["accuracy", "relevance"]
scoring_functions=["accuracy", "relevance"],
)
# Run evaluation
@ -21,16 +21,10 @@ job = client.eval.run_eval(
task_id="my_eval",
task_config={
"type": "app",
"eval_candidate": {
"type": "agent",
"config": agent_config
}
}
"eval_candidate": {"type": "agent", "config": agent_config},
},
)
# Get results
result = client.eval.job_result(
task_id="my_eval",
job_id=job.job_id
)
result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)
```
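Evaluation jobs can take a while to complete. A minimal polling sketch, assuming the eval client also exposes a `job_status` method alongside `job_result` (check your installed client for the exact name and return values):

```python
import time

# Sketch: wait for the job to finish before fetching results as shown above.
while client.eval.job_status(task_id="my_eval", job_id=job.job_id) == "in_progress":
    time.sleep(5)
```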

View file

@ -4,7 +4,7 @@ Llama Stack provides all the building blocks needed to create sophisticated AI a
The best way to get started is to look at this notebook which walks through the various APIs (from basic inference, to RAG agents) and how to use them.
**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb)
**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)
Here are some key topics that will help you build effective agents:

View file

@ -34,15 +34,15 @@ chunks = [
{
"document_id": "doc1",
"content": "Your document text here",
"mime_type": "text/plain"
"mime_type": "text/plain",
},
...
]
client.vector_io.insert(vector_db_id, chunks)
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
# You can then query for these chunks
chunks_response = client.vector_io.query(vector_db_id, query="What do you know about...")
chunks_response = client.vector_io.query(
vector_db_id=vector_db_id, query="What do you know about..."
)
```
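The query call returns the matching chunks together with similarity scores. A short sketch of inspecting them, assuming the response exposes `chunks` and `scores` fields:

```python
# Sketch: print each retrieved chunk with its similarity score.
for chunk, score in zip(chunks_response.chunks, chunks_response.scores):
    print(f"[score={score:.3f}] {chunk.content[:80]}")
```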
### Using the RAG Tool
@ -71,8 +71,8 @@ client.tool_runtime.rag_tool.insert(
# Query documents
results = client.tool_runtime.rag_tool.query(
vector_db_id=vector_db_id,
query="What do you know about...",
vector_db_ids=[vector_db_id],
content="What do you know about...",
)
```
@ -81,19 +81,22 @@ results = client.tool_runtime.rag_tool.query(
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
```python
from llama_stack_client.types.agent_create_params import AgentConfig
from llama_stack_client.lib.agents.agent import Agent
# Configure agent with memory
agent_config = AgentConfig(
model="Llama3.2-3B-Instruct",
model="meta-llama/Llama-3.2-3B-Instruct",
instructions="You are a helpful assistant",
enable_session_persistence=False,
toolgroups=[
{
"name": "builtin::rag",
"args": {
"vector_db_ids": [vector_db_id],
}
},
}
]
],
)
agent = Agent(client, agent_config)
@ -101,25 +104,21 @@ session_id = agent.create_session("rag_session")
# Initial document ingestion
response = agent.create_turn(
messages=[{
"role": "user",
"content": "I am providing some documents for reference."
}],
documents=[
dict(
content="https://raw.githubusercontent.com/example/doc.rst",
mime_type="text/plain"
)
messages=[
{"role": "user", "content": "I am providing some documents for reference."}
],
session_id=session_id
documents=[
{
"content": "https://raw.githubusercontent.com/example/doc.rst",
"mime_type": "text/plain",
}
],
session_id=session_id,
)
# Query with RAG
response = agent.create_turn(
messages=[{
"role": "user",
"content": "What are the key topics in the documents?"
}],
session_id=session_id
messages=[{"role": "user", "content": "What are the key topics in the documents?"}],
session_id=session_id,
)
```

View file

@ -5,15 +5,11 @@ Safety is a critical component of any AI application. Llama Stack provides a Shi
```python
# Register a safety shield
shield_id = "content_safety"
client.shields.register(
shield_id=shield_id,
provider_shield_id="llama-guard-basic"
)
client.shields.register(shield_id=shield_id, provider_shield_id="llama-guard-basic")
# Run content through shield
response = client.safety.run_shield(
shield_id=shield_id,
messages=[{"role": "user", "content": "User message here"}]
shield_id=shield_id, messages=[{"role": "user", "content": "User message here"}]
)
if response.violation:
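    # Sketch of handling a violation; `violation_level` and `user_message` are
    # assumed field names on the violation object (check your client version).
    print(f"Blocked ({response.violation.violation_level}): {response.violation.user_message}")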

View file

@ -8,24 +8,16 @@ The telemetry system supports three main types of events:
- **Unstructured Log Events**: Free-form log messages with severity levels
```python
unstructured_log_event = UnstructuredLogEvent(
message="This is a log message",
severity=LogSeverity.INFO
message="This is a log message", severity=LogSeverity.INFO
)
```
- **Metric Events**: Numerical measurements with units
```python
metric_event = MetricEvent(
metric="my_metric",
value=10,
unit="count"
)
metric_event = MetricEvent(metric="my_metric", value=10, unit="count")
```
- **Structured Log Events**: System events like span start/end. Extensible to add more structured log types.
```python
structured_log_event = SpanStartPayload(
name="my_span",
parent_span_id="parent_span_id"
)
structured_log_event = SpanStartPayload(name="my_span", parent_span_id="parent_span_id")
```
### Spans and Traces

View file

@ -35,7 +35,7 @@ Example client SDK call to register a "websearch" toolgroup that is provided by
client.toolgroups.register(
toolgroup_id="builtin::websearch",
provider_id="brave-search",
args={"max_results": 5}
args={"max_results": 5},
)
```
@ -50,8 +50,7 @@ The Code Interpreter allows execution of Python code within a controlled environ
```python
# Register Code Interpreter tool group
client.toolgroups.register(
toolgroup_id="builtin::code_interpreter",
provider_id="code_interpreter"
toolgroup_id="builtin::code_interpreter", provider_id="code_interpreter"
)
```
@ -68,16 +67,14 @@ The WolframAlpha tool provides access to computational knowledge through the Wol
```python
# Register WolframAlpha tool group
client.toolgroups.register(
toolgroup_id="builtin::wolfram_alpha",
provider_id="wolfram-alpha"
toolgroup_id="builtin::wolfram_alpha", provider_id="wolfram-alpha"
)
```
Example usage:
```python
result = client.tool_runtime.invoke_tool(
tool_name="wolfram_alpha",
args={"query": "solve x^2 + 2x + 1 = 0"}
tool_name="wolfram_alpha", args={"query": "solve x^2 + 2x + 1 = 0"}
)
```
@ -90,10 +87,7 @@ The Memory tool enables retrieval of context from various types of memory banks
client.toolgroups.register(
toolgroup_id="builtin::memory",
provider_id="memory",
args={
"max_chunks": 5,
"max_tokens_in_context": 4096
}
args={"max_chunks": 5, "max_tokens_in_context": 4096},
)
```
@ -136,9 +130,7 @@ config = AgentConfig(
toolgroups=[
"builtin::websearch",
],
client_tools=[
ToolDef(name="client_tool", description="Client provided tool")
]
client_tools=[ToolDef(name="client_tool", description="Client provided tool")],
)
```
@ -167,9 +159,9 @@ Example tool definition:
"name": "query",
"parameter_type": "string",
"description": "The query to search for",
"required": True
"required": True,
}
]
],
}
```
@ -179,8 +171,7 @@ Tools can be invoked using the `invoke_tool` method:
```python
result = client.tool_runtime.invoke_tool(
tool_name="web_search",
kwargs={"query": "What is the capital of France?"}
tool_name="web_search", kwargs={"query": "What is the capital of France?"}
)
```
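The invocation returns a structured result. A short sketch of inspecting it, assuming the result exposes `content` and `error_message` fields (names may vary between client versions):

```python
# Sketch: check the tool invocation outcome.
if result.error_message:
    print(f"Tool call failed: {result.error_message}")
else:
    print(result.content)
```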

View file

@ -62,10 +62,3 @@ While there is a lot of flexibility to mix-and-match providers, often users will
**On-device Distro**: Finally, you may want to run Llama Stack directly on an edge device (mobile phone or a tablet.) We provide Distros for iOS and Android (coming soon.)
```{toctree}
:maxdepth: 1
:hidden:
distributions/index
```

View file

@ -68,6 +68,7 @@ myst_substitutions = {
"docker_hub": "https://hub.docker.com/repository/docker/llamastack",
}
suppress_warnings = ['myst.header']
# Copy button settings
copybutton_prompt_text = "$ " # for bash prompts
@ -94,22 +95,6 @@ html_static_path = ["../_static"]
# html_logo = "../_static/llama-stack-logo.png"
html_style = "../_static/css/my_theme.css"
redoc = [
{
"name": "Llama Stack API",
"page": "references/api_reference/index",
"spec": "../resources/llama-stack-spec.yaml",
"opts": {
"suppress-warnings": True,
# "expand-responses": ["200", "201"],
},
"embed": True,
},
]
redoc_uri = "https://cdn.redoc.ly/redoc/latest/bundles/redoc.standalone.js"
def setup(app):
def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
url = f"https://hub.docker.com/r/llamastack/{text}"

View file

@ -3,7 +3,7 @@
This guide will walk you through the process of adding a new API provider to Llama Stack.
- Begin by reviewing the [core concepts](../concepts/) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
- Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
- Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
- Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
- Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`llama_stack/scripts/distro_codegen.py` if necessary.

View file

@ -180,12 +180,45 @@ After this step is successful, you should be able to find the built container im
### Running your Stack server
Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file that was written out at the end of the `llama stack build` step.
```
llama stack run -h
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE]
[--tls-certfile TLS_CERTFILE] [--image-type {conda,container,venv}]
config
start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
positional arguments:
config Path to config file to use for the run
options:
-h, --help show this help message and exit
--port PORT Port to run the server on. Defaults to 8321
--image-name IMAGE_NAME
Name of the image to run. Defaults to the current conda environment
--disable-ipv6 Disable IPv6 support
--env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.
--tls-keyfile TLS_KEYFILE
Path to TLS key file for HTTPS
--tls-certfile TLS_CERTFILE
Path to TLS certificate file for HTTPS
--image-type {conda,container,venv}
Image Type used during the build. This can be either conda or container or venv.
```
```
# Start using template name
llama stack run tgi
# Start using config file
llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
# Start using a venv
llama stack run --image-type venv ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
# Start using a conda environment
llama stack run --image-type conda ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
```
```

View file

@ -1,9 +1,9 @@
# Using Llama Stack as a Library
If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library. This avoids the overhead of setting up a server.
```python
```bash
# setup
pip install llama-stack
uv pip install llama-stack
llama stack build --template together --image-type venv
```
@ -13,7 +13,7 @@ from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient(
"ollama",
# provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
provider_data = {"tavily_search_api_key": os.environ['TAVILY_SEARCH_API_KEY']}
provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
)
await client.initialize()
```

View file

@ -7,14 +7,19 @@ You can run a Llama Stack server in one of the following ways:
This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and relying on an external inference service (eg. fireworks, together, groq, etc.) See [Using Llama Stack as a Library](importing_as_library)
**Docker**:
**Container**:
Another simple way to start interacting with Llama Stack is to just spin up docker which is pre-built with all the providers you need. We provide a number of pre-built Docker containers so you can start a Llama Stack server instantly. You can also build your own custom Docker container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](distributions/selection) for more details.
Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
**Conda**:
Lastly, if you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
**Kubernetes**:
If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
```{toctree}
@ -24,4 +29,6 @@ Lastly, if you have a custom or an advanced setup or you are developing on Llama
importing_as_library
building_distro
configuration
selection
kubernetes_deployment
```

View file

@ -0,0 +1,207 @@
# Kubernetes Deployment Guide
Instead of starting the Llama Stack and vLLM servers locally, we can deploy them in a Kubernetes cluster. In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
First, create a local Kubernetes cluster via Kind:
```bash
kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test
```
Start vLLM server as a Kubernetes Pod and Service:
```bash
cat <<EOF |kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: vllm-models
spec:
accessModes:
- ReadWriteOnce
volumeMode: Filesystem
resources:
requests:
storage: 50Gi
---
apiVersion: v1
kind: Secret
metadata:
name: hf-token-secret
type: Opaque
data:
token: $(HF_TOKEN)
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: vllm-server
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: vllm
template:
metadata:
labels:
app.kubernetes.io/name: vllm
spec:
containers:
- name: llama-stack
image: $(VLLM_IMAGE)
command:
- bash
- -c
- |
MODEL="meta-llama/Llama-3.2-1B-Instruct"
MODEL_PATH=/app/model/$(basename $MODEL)
huggingface-cli login --token $HUGGING_FACE_HUB_TOKEN
huggingface-cli download $MODEL --local-dir $MODEL_PATH --cache-dir $MODEL_PATH
python3 -m vllm.entrypoints.openai.api_server --model $MODEL_PATH --served-model-name $MODEL --port 8000
ports:
- containerPort: 8000
volumeMounts:
- name: llama-storage
mountPath: /app/model
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: vllm-models
---
apiVersion: v1
kind: Service
metadata:
name: vllm-server
spec:
selector:
app.kubernetes.io/name: vllm
ports:
- protocol: TCP
port: 8000
targetPort: 8000
type: ClusterIP
EOF
```
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
```bash
$ kubectl logs -l app.kubernetes.io/name=vllm
...
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
```
Then we can modify the Llama Stack run configuration YAML with the following inference provider:
```yaml
providers:
inference:
- provider_id: vllm
provider_type: remote::vllm
config:
url: http://vllm-server.default.svc.cluster.local:8000/v1
max_tokens: 4096
api_token: fake
```
Once we have defined the run configuration for Llama Stack, we can build an image with that configuration and the server source code:
```bash
cat >/tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s <<EOF
FROM distribution-myenv:dev
RUN apt-get update && apt-get install -y git
RUN git clone https://github.com/meta-llama/llama-stack.git /app/llama-stack-source
ADD ./vllm-llama-stack-run-k8s.yaml /app/config.yaml
EOF
podman build -f /tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s /tmp/test-vllm-llama-stack
```
We can then start the Llama Stack server by deploying a Kubernetes Pod and Service:
```bash
cat <<EOF |kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: llama-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llama-stack-server
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: llama-stack
template:
metadata:
labels:
app.kubernetes.io/name: llama-stack
spec:
containers:
- name: llama-stack
image: localhost/llama-stack-run-k8s:latest
imagePullPolicy: IfNotPresent
command: ["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]
ports:
- containerPort: 5000
volumeMounts:
- name: llama-storage
mountPath: /root/.llama
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: llama-pvc
---
apiVersion: v1
kind: Service
metadata:
name: llama-stack-service
spec:
selector:
app.kubernetes.io/name: llama-stack
ports:
- protocol: TCP
port: 5000
targetPort: 5000
type: ClusterIP
EOF
```
We can check that the LlamaStack server has started:
```bash
$ kubectl logs -l app.kubernetes.io/name=llama-stack
...
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: ASGI 'lifespan' protocol appears unsupported.
INFO: Application startup complete.
INFO: Uvicorn running on http://['::', '0.0.0.0']:5000 (Press CTRL+C to quit)
```
Finally, we forward the Kubernetes service to a local port and test some inference requests against it via the Llama Stack Client:
```bash
kubectl port-forward service/llama-stack-service 5000:5000
llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
```
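With the port-forward in place, you can also exercise the server from the Python SDK. A minimal sketch, assuming `llama-stack-client` is installed locally and the model deployed above is registered with the stack:

```python
from llama_stack_client import LlamaStackClient

# Sketch: talk to the forwarded Kubernetes service from the host machine.
client = LlamaStackClient(base_url="http://localhost:5000")

response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-1B-Instruct",  # the model served by vLLM above
    messages=[{"role": "user", "content": "hello, what model are you?"}],
)
print(response.completion_message.content)
```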

View file

@ -1,3 +1,4 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# NVIDIA Distribution
The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.

View file

@ -23,7 +23,7 @@ Which templates / distributions to choose depends on the hardware you have for r
- {dockerhub}`distribution-together` ([Guide](self_hosted_distro/together))
- {dockerhub}`distribution-fireworks` ([Guide](self_hosted_distro/fireworks))
- **Do you want to run Llama Stack inference on your iOS / Android device** Lastly, we also provide templates for running Llama Stack inference on your iOS / Android device:
- **Do you want to run Llama Stack inference on your iOS / Android device?** Lastly, we also provide templates for running Llama Stack inference on your iOS / Android device:
- [iOS SDK](ondevice_distro/ios_sdk)
- [Android](ondevice_distro/android_sdk)
@ -43,7 +43,6 @@ self_hosted_distro/nvidia
self_hosted_distro/ollama
self_hosted_distro/together
self_hosted_distro/fireworks
ondevice_distro/index
```
### On-Device Distributions

View file

@ -1,3 +1,4 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Bedrock Distribution
```{toctree}

View file

@ -1,3 +1,4 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Cerebras Distribution
The `llamastack/distribution-cerebras` distribution consists of the following provider configurations.

View file

@ -0,0 +1,186 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Dell Distribution of Llama Stack
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-dell` distribution consists of the following provider configurations.
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
| inference | `remote::tgi` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
You can use this distribution if you have GPUs and want to run an independent TGI or Dell Enterprise Hub container for running inference.
### Environment Variables
The following environment variables can be configured:
- `DEH_URL`: URL for the Dell inference server (default: `http://0.0.0.0:8181`)
- `DEH_SAFETY_URL`: URL for the Dell safety inference server (default: `http://0.0.0.0:8282`)
- `CHROMA_URL`: URL for the Chroma server (default: `http://localhost:6601`)
- `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
## Setting up Inference server using Dell Enterprise Hub's custom TGI container.
NOTE: This is a placeholder to run inference with TGI. This will be updated to use [Dell Enterprise Hub's containers](https://dell.huggingface.co/authenticated/models) once verified.
```bash
export INFERENCE_PORT=8181
export DEH_URL=http://0.0.0.0:$INFERENCE_PORT
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
export CHROMADB_HOST=localhost
export CHROMADB_PORT=6601
export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT
export CUDA_VISIBLE_DEVICES=0
export LLAMA_STACK_PORT=8321
docker run --rm -it \
--network host \
-v $HOME/.cache/huggingface:/data \
-e HF_TOKEN=$HF_TOKEN \
-p $INFERENCE_PORT:$INFERENCE_PORT \
--gpus $CUDA_VISIBLE_DEVICES \
ghcr.io/huggingface/text-generation-inference \
--dtype bfloat16 \
--usage-stats off \
--sharded false \
--cuda-memory-fraction 0.7 \
--model-id $INFERENCE_MODEL \
--port $INFERENCE_PORT --hostname 0.0.0.0
```
If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
```bash
export SAFETY_INFERENCE_PORT=8282
export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export CUDA_VISIBLE_DEVICES=1
docker run --rm -it \
--network host \
-v $HOME/.cache/huggingface:/data \
-e HF_TOKEN=$HF_TOKEN \
-p $SAFETY_INFERENCE_PORT:$SAFETY_INFERENCE_PORT \
--gpus $CUDA_VISIBLE_DEVICES \
ghcr.io/huggingface/text-generation-inference \
--dtype bfloat16 \
--usage-stats off \
--sharded false \
--cuda-memory-fraction 0.7 \
--model-id $SAFETY_MODEL \
--hostname 0.0.0.0 \
--port $SAFETY_INFERENCE_PORT
```
## Dell distribution relies on ChromaDB for vector database usage
You can start a ChromaDB container easily using Docker or Podman.
```bash
# This is where the indices are persisted
mkdir -p $HOME/chromadb
podman run --rm -it \
--network host \
--name chromadb \
-v $HOME/chromadb:/chroma/chroma \
-e IS_PERSISTENT=TRUE \
chromadb/chroma:latest \
--port $CHROMADB_PORT \
--host $CHROMADB_HOST
```
## Running Llama Stack
Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (building the code yourself) or Docker (which uses a pre-built image).
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
docker run -it \
--network host \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v $HOME/.llama:/root/.llama \
# NOTE: mount the llama-stack / llama-model directories if testing local changes else not needed
-v /home/hjshah/git/llama-stack:/app/llama-stack-source -v /home/hjshah/git/llama-models:/app/llama-models-source \
# localhost/distribution-dell:dev if building / testing locally
llamastack/distribution-dell\
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env CHROMA_URL=$CHROMA_URL
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
# You need a local checkout of llama-stack to run this, get it using
# git clone https://github.com/meta-llama/llama-stack.git
cd /path/to/llama-stack
export SAFETY_INFERENCE_PORT=8282
export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v $HOME/.llama:/root/.llama \
-v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
llamastack/distribution-dell \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env DEH_SAFETY_URL=$DEH_SAFETY_URL \
--env CHROMA_URL=$CHROMA_URL
```
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template dell --image-type conda
llama stack run dell
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env CHROMA_URL=$CHROMA_URL
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run ./run-with-safety.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env DEH_SAFETY_URL=$DEH_SAFETY_URL \
--env CHROMA_URL=$CHROMA_URL
```

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Fireworks Distribution
```{toctree}

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Meta Reference Distribution
```{toctree}
@ -82,7 +83,7 @@ docker run \
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template meta-reference-gpu --image-type conda

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Meta Reference Quantized Distribution
```{toctree}
@ -82,7 +83,7 @@ docker run \
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template meta-reference-quantized-gpu --image-type conda

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Ollama Distribution
```{toctree}
@ -25,7 +26,9 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Environment Variables
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
### Environment Variables
The following environment variables can be configured:
@ -101,7 +104,7 @@ docker run \
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash
export LLAMA_STACK_PORT=5001

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Remote vLLM Distribution
```{toctree}
:maxdepth: 2
@ -131,7 +132,7 @@ docker run \
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash
export INFERENCE_PORT=8000

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# SambaNova Distribution
```{toctree}
@ -38,13 +39,15 @@ The following models are available by default:
- `meta-llama/Llama-3.1-405B-Instruct-FP8 (Meta-Llama-3.1-405B-Instruct)`
- `meta-llama/Llama-3.2-1B-Instruct (Meta-Llama-3.2-1B-Instruct)`
- `meta-llama/Llama-3.2-3B-Instruct (Meta-Llama-3.2-3B-Instruct)`
- `meta-llama/Llama-3.3-70B-Instruct (Meta-Llama-3.3-70B-Instruct)`
- `meta-llama/Llama-3.2-11B-Vision-Instruct (Llama-3.2-11B-Vision-Instruct)`
- `meta-llama/Llama-3.2-90B-Vision-Instruct (Llama-3.2-90B-Vision-Instruct)`
- `meta-llama/Llama-Guard-3-8B (Meta-Llama-Guard-3-8B)`
### Prerequisite: API Keys
Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaBova.ai](https://sambanova.ai/).
Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](https://sambanova.ai/).
## Running Llama Stack with SambaNova

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# TGI Distribution
@ -122,7 +123,7 @@ docker run \
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template tgi --image-type conda

View file

@ -1,6 +1,7 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Together Distribution
```{toctree}

View file

@ -1,6 +1,6 @@
# Quick Start
In this guide, we'll walk through how you can use the Llama Stack (server and client SDK ) to test a simple RAG agent.
In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple RAG agent.
A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with tools (e.g., RAG, web search, code execution, etc.) for taking actions.
@ -15,8 +15,11 @@ ollama run llama3.2:3b-instruct-fp16 --keepalive 60m
By default, Ollama keeps the model loaded in memory for 5 minutes, which can be too short. We set the `--keepalive` flag to 60 minutes to ensure the model remains loaded for some time.
NOTE: If you do not have ollama, you can install it from [here](https://ollama.ai/docs/installation).
```{admonition} Note
:class: tip
If you do not have ollama, you can install it from [here](https://ollama.com/download).
```
### 2. Pick a client environment
@ -35,15 +38,20 @@ The API is **exactly identical** for both clients.
:::{dropdown} Starting up the Llama Stack server
The Llama Stack server can be configured flexibly so you can mix-and-match various providers for its individual API components -- beyond Inference, these include Vector IO, Agents, Telemetry, Evals, Post Training, etc.
To get started quickly, we provide various Docker images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the Docker image.
To get started quickly, we provide various container images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the container image.
Let's set up some environment variables that we will use in the rest of the guide.
```bash
INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
LLAMA_STACK_PORT=8321
export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
export LLAMA_STACK_PORT=8321
```
You can start the server using the following command:
Next you can create a local directory to mount into the container's file system.
```bash
mkdir -p ~/.llama
```
Then you can start the server using the container tool of your choice. For example, if you are running Docker, you can use the following command:
```bash
docker run -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@ -53,8 +61,28 @@ docker run -it \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434
```
As another example, to start the container with Podman, you can do the same but replace `docker` at the start of the command with `podman`. If you are using `podman` older than `4.7.0`, please also replace `host.docker.internal` in the `OLLAMA_URL` with `host.containers.internal`.
Configuration for this is available at `distributions/ollama/run.yaml`.
```{admonition} Note
:class: note
Docker containers run in their own isolated network namespaces on Linux. To allow the container to communicate with services running on the host via `localhost`, you need `--network=host`. This makes the container use the host's network directly so it can connect to Ollama running on `localhost:11434`.
Linux users having issues running the above command should instead try the following:
```bash
docker run -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
--network=host \
llamastack/distribution-ollama \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://localhost:11434
```
:::
@ -71,8 +99,10 @@ pip install llama-stack-client
Let's use the `llama-stack-client` CLI to check the connectivity to the server.
```bash
llama-stack-client configure --endpoint http://localhost:$LLAMA_STACK_PORT
llama-stack-client models list
$ llama-stack-client configure --endpoint http://localhost:$LLAMA_STACK_PORT
> Enter the API key (leave empty if no key is needed):
Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
$ llama-stack-client models list
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ provider_resource_id ┃ metadata ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
@ -95,19 +125,30 @@ llama-stack-client \
Here is a simple example to perform chat completions using the SDK.
```python
import os
import sys
def create_http_client():
from llama_stack_client import LlamaStackClient
return LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
return LlamaStackClient(
base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}"
)
def create_library_client(template="ollama"):
from llama_stack import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient(template)
client.initialize()
if not client.initialize():
print("llama stack not built properly")
sys.exit(1)
return client
client = create_library_client() # or create_http_client() depending on the environment you picked
client = (
create_library_client()
) # or create_http_client() depending on the environment you picked
# List available models
models = client.models.list()
@ -120,8 +161,8 @@ response = client.inference.chat_completion(
model_id=os.environ["INFERENCE_MODEL"],
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Write a haiku about coding"}
]
{"role": "user", "content": "Write a haiku about coding"},
],
)
print(response.completion_message.content)
```
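For longer generations you may prefer streaming. Here is a sketch of the same call with `stream=True`; the exact shape of each streamed chunk can vary between client versions, so treat the attribute access below as an assumption:

```python
# Sketch: stream tokens as they are generated.
stream = client.inference.chat_completion(
    model_id=os.environ["INFERENCE_MODEL"],
    messages=[{"role": "user", "content": "Write a haiku about coding"}],
    stream=True,
)
for chunk in stream:
    # Each chunk is assumed to carry an event with an incremental text delta.
    print(getattr(chunk.event.delta, "text", ""), end="", flush=True)
print()
```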
@ -132,6 +173,7 @@ Here is an example of a simple RAG (Retrieval Augmented Generation) chatbot agen
```python
import os
import uuid
from termcolor import cprint
from llama_stack_client.lib.agents.agent import Agent
@ -139,10 +181,29 @@ from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client.types.agent_create_params import AgentConfig
from llama_stack_client.types import Document
client = create_library_client() # or create_http_client() depending on the environment you picked
def create_http_client():
from llama_stack_client import LlamaStackClient
return LlamaStackClient(
base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}"
)
def create_library_client(template="ollama"):
from llama_stack import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient(template)
client.initialize()
return client
client = (
create_library_client()
) # or create_http_client() depending on the environment you picked
# Documents to be used for RAG
urls = ["chat.rst", "llama3.rst", "datasets.rst", "lora_finetune.rst"]
urls = ["chat.rst", "llama3.rst", "memory_optimizations.rst", "lora_finetune.rst"]
documents = [
Document(
document_id=f"num-{i}",
@ -154,7 +215,7 @@ documents = [
]
# Register a vector database
vector_db_id = "test-vector-db"
vector_db_id = f"test-vector-db-{uuid.uuid4().hex}"
client.vector_dbs.register(
vector_db_id=vector_db_id,
embedding_model="all-MiniLM-L6-v2",
@ -174,12 +235,12 @@ agent_config = AgentConfig(
instructions="You are a helpful assistant",
enable_session_persistence=False,
# Define tools available to the agent
toolgroups = [
toolgroups=[
{
"name": "builtin::rag",
"args" : {
"vector_db_ids": [vector_db_id],
}
"name": "builtin::rag",
"args": {
"vector_db_ids": [vector_db_id],
},
}
],
)
@ -193,7 +254,7 @@ user_prompts = [
# Run the agent loop by calling the `create_turn` method
for prompt in user_prompts:
cprint(f'User> {prompt}', 'green')
cprint(f"User> {prompt}", "green")
response = rag_agent.create_turn(
messages=[{"role": "user", "content": prompt}],
session_id=session_id,

View file

@ -1,7 +1,8 @@
```{admonition} News
:class: tip
Llama Stack 0.1.0 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.0) for more details.
Llama Stack 0.1.2 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.2) for more details.
```
# Llama Stack

View file

@ -1,7 +1,6 @@
{.hide-title}
# API Reference
```{eval-rst}
.. sphinxcontrib-redoc:: ../resources/llama-stack-spec.yaml
:page-title: API Reference
:expand-responses: all
```{raw} html
:file: ../../../_static/llama-stack-spec.html
```

View file

@ -12,7 +12,7 @@ This guide goes over the sets of APIs and developer experience flow of using Lla
## Evaluation Concepts
The Evaluation APIs are associated with a set of Resources as shown in the following diagram. Please visit the Resources section in our [Core Concepts](../concepts/index.md) guide for better high-level understanding.
The Evaluation APIs are associated with a set of Resources as shown in the following diagram. Please visit the Resources section in our [Core Concepts](../../concepts/index.md) guide for better high-level understanding.
![Eval Concepts](./resources/eval-concept.png)
@ -51,6 +51,7 @@ This first example walks you through how to evaluate a model candidate served by
```python
import datasets
ds = datasets.load_dataset(path="llamastack/mmmu", name="Agriculture", split="dev")
ds = ds.select_columns(["chat_completion_input", "input_query", "expected_answer"])
eval_rows = ds.to_pandas().to_dict(orient="records")
@ -79,7 +80,7 @@ system_message = {
client.eval_tasks.register(
eval_task_id="meta-reference::mmmu",
dataset_id=f"mmmu-{subset}-{split}",
scoring_functions=["basic::regex_parser_multiple_choice_answer"]
scoring_functions=["basic::regex_parser_multiple_choice_answer"],
)
response = client.eval.evaluate_rows(
@ -98,9 +99,9 @@ response = client.eval.evaluate_rows(
"max_tokens": 4096,
"repeat_penalty": 1.0,
},
"system_message": system_message
}
}
"system_message": system_message,
},
},
)
```
@ -124,7 +125,7 @@ _ = client.datasets.register(
"input_query": {"type": "string"},
"expected_answer": {"type": "string"},
"chat_completion_input": {"type": "chat_completion_input"},
}
},
)
eval_rows = client.datasetio.get_rows_paginated(
@ -137,7 +138,7 @@ eval_rows = client.datasetio.get_rows_paginated(
client.eval_tasks.register(
eval_task_id="meta-reference::simpleqa",
dataset_id=simpleqa_dataset_id,
scoring_functions=["llm-as-judge::405b-simpleqa"]
scoring_functions=["llm-as-judge::405b-simpleqa"],
)
response = client.eval.evaluate_rows(
@ -156,8 +157,8 @@ response = client.eval.evaluate_rows(
"max_tokens": 4096,
"repeat_penalty": 1.0,
},
}
}
},
},
)
```
@ -180,14 +181,14 @@ agent_config = {
{
"type": "brave_search",
"engine": "tavily",
"api_key": userdata.get("TAVILY_SEARCH_API_KEY")
"api_key": userdata.get("TAVILY_SEARCH_API_KEY"),
}
],
"tool_choice": "auto",
"tool_prompt_format": "json",
"input_shields": [],
"output_shields": [],
"enable_session_persistence": False
"enable_session_persistence": False,
}
response = client.eval.evaluate_rows(
@ -199,8 +200,8 @@ response = client.eval.evaluate_rows(
"eval_candidate": {
"type": "agent",
"config": agent_config,
}
}
},
},
)
```
@ -237,7 +238,9 @@ GENERATED_RESPONSE: {generated_answer}
EXPECTED_RESPONSE: {expected_answer}
"""
input_query = "What are the top 5 topics that were explained? Only list succinct bullet points."
input_query = (
"What are the top 5 topics that were explained? Only list succinct bullet points."
)
generated_answer = """
Here are the top 5 topics that were explained in the documentation for Torchtune:
@ -268,7 +271,9 @@ scoring_params = {
"braintrust::factuality": None,
}
response = client.scoring.score(input_rows=dataset_rows, scoring_functions=scoring_params)
response = client.scoring.score(
input_rows=dataset_rows, scoring_functions=scoring_params
)
```
## Running Evaluations via CLI

View file

@ -33,7 +33,11 @@ from llama_stack_client.types import (
Types:
```python
from llama_stack_client.types import ListToolGroupsResponse, ToolGroup, ToolgroupListResponse
from llama_stack_client.types import (
ListToolGroupsResponse,
ToolGroup,
ToolgroupListResponse,
)
```
Methods:
@ -444,7 +448,11 @@ Methods:
Types:
```python
from llama_stack_client.types import EvalTask, ListEvalTasksResponse, EvalTaskListResponse
from llama_stack_client.types import (
EvalTask,
ListEvalTasksResponse,
EvalTaskListResponse,
)
```
Methods:

View file

@ -45,7 +45,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
---
## Install Dependencies and Set Up Environmen
## Install Dependencies and Set Up Environment
1. **Create a Conda Environment**:
Create a new Conda environment with Python 3.10:
@ -73,7 +73,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
Open a new terminal and install `llama-stack`:
```bash
conda activate ollama
pip install llama-stack==0.0.61
pip install llama-stack==0.1.0
```
---
@ -110,7 +110,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
--env SAFETY_MODEL=$SAFETY_MODEL
--env OLLAMA_URL=$OLLAMA_URL
```
Note: Everytime you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model.
Note: Every time you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model.
The server will start and listen on `http://localhost:5001`.
@ -191,7 +191,7 @@ You can check the available models with the command `llama-stack-client models l
You can also interact with the Llama Stack server using a simple Python script. Below is an example:
### 1. Activate Conda Environmen
### 1. Activate Conda Environment
```bash
conda activate ollama
@ -208,7 +208,7 @@ In `test_llama_stack.py`, write the following code:
```python
import os
from llama_stack_client import LlamaStackClien
from llama_stack_client import LlamaStackClient
# Get the model ID from the environment variable
INFERENCE_MODEL = os.environ.get("INFERENCE_MODEL")
@ -224,7 +224,7 @@ client = LlamaStackClient(base_url="http://localhost:5001")
response = client.inference.chat_completion(
messages=[
{"role": "system", "content": "You are a friendly assistant."},
{"role": "user", "content": "Write a two-sentence poem about llama."}
{"role": "user", "content": "Write a two-sentence poem about llama."},
],
model_id=INFERENCE_MODEL,
)

View file

@ -15,20 +15,21 @@ from typing import (
Literal,
Optional,
Protocol,
runtime_checkable,
Union,
runtime_checkable,
)
from llama_models.schema_utils import json_schema_type, register_schema, webmethod
from pydantic import BaseModel, ConfigDict, Field
from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, URL
from llama_stack.apis.common.content_types import URL, ContentDelta, InterleavedContent
from llama_stack.apis.inference import (
CompletionMessage,
ResponseFormat,
SamplingParams,
ToolCall,
ToolChoice,
ToolConfig,
ToolPromptFormat,
ToolResponse,
ToolResponseMessage,
@ -86,9 +87,7 @@ class ShieldCallStep(StepCommon):
@json_schema_type
class MemoryRetrievalStep(StepCommon):
step_type: Literal[StepType.memory_retrieval.value] = (
StepType.memory_retrieval.value
)
step_type: Literal[StepType.memory_retrieval.value] = StepType.memory_retrieval.value
vector_db_ids: str
inserted_context: InterleavedContent
@ -118,7 +117,7 @@ class Turn(BaseModel):
]
steps: List[Step]
output_message: CompletionMessage
output_attachments: List[Attachment] = Field(default_factory=list)
output_attachments: Optional[List[Attachment]] = Field(default_factory=list)
started_at: datetime
completed_at: Optional[datetime] = None
@ -155,10 +154,25 @@ class AgentConfigCommon(BaseModel):
output_shields: Optional[List[str]] = Field(default_factory=list)
toolgroups: Optional[List[AgentToolGroup]] = Field(default_factory=list)
client_tools: Optional[List[ToolDef]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
tool_choice: Optional[ToolChoice] = Field(default=None, deprecated="use tool_config instead")
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None, deprecated="use tool_config instead")
tool_config: Optional[ToolConfig] = Field(default=None)
max_infer_iters: int = 10
max_infer_iters: Optional[int] = 10
def model_post_init(self, __context):
if self.tool_config:
if self.tool_choice and self.tool_config.tool_choice != self.tool_choice:
raise ValueError("tool_choice is deprecated. Use tool_choice in tool_config instead.")
if self.tool_prompt_format and self.tool_config.tool_prompt_format != self.tool_prompt_format:
raise ValueError("tool_prompt_format is deprecated. Use tool_prompt_format in tool_config instead.")
else:
params = {}
if self.tool_choice:
params["tool_choice"] = self.tool_choice
if self.tool_prompt_format:
params["tool_prompt_format"] = self.tool_prompt_format
self.tool_config = ToolConfig(**params)
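The `model_post_init` hook above folds the deprecated `tool_choice` / `tool_prompt_format` fields into `tool_config`. A minimal migration sketch, assuming the import path used elsewhere in this diff:

```python
from llama_stack.apis.inference import ToolChoice, ToolConfig, ToolPromptFormat

# Prefer tool_config over the deprecated top-level fields.
tool_config = ToolConfig(
    tool_choice=ToolChoice.auto,
    tool_prompt_format=ToolPromptFormat.json,
)
# Supplying tool_choice/tool_prompt_format alongside a conflicting tool_config
# raises ValueError in model_post_init above.
```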
@json_schema_type
@ -184,9 +198,7 @@ class AgentTurnResponseEventType(Enum):
@json_schema_type
class AgentTurnResponseStepStartPayload(BaseModel):
event_type: Literal[AgentTurnResponseEventType.step_start.value] = (
AgentTurnResponseEventType.step_start.value
)
event_type: Literal[AgentTurnResponseEventType.step_start.value] = AgentTurnResponseEventType.step_start.value
step_type: StepType
step_id: str
metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
@ -194,9 +206,7 @@ class AgentTurnResponseStepStartPayload(BaseModel):
@json_schema_type
class AgentTurnResponseStepCompletePayload(BaseModel):
event_type: Literal[AgentTurnResponseEventType.step_complete.value] = (
AgentTurnResponseEventType.step_complete.value
)
event_type: Literal[AgentTurnResponseEventType.step_complete.value] = AgentTurnResponseEventType.step_complete.value
step_type: StepType
step_id: str
step_details: Step
@ -206,9 +216,7 @@ class AgentTurnResponseStepCompletePayload(BaseModel):
class AgentTurnResponseStepProgressPayload(BaseModel):
model_config = ConfigDict(protected_namespaces=())
event_type: Literal[AgentTurnResponseEventType.step_progress.value] = (
AgentTurnResponseEventType.step_progress.value
)
event_type: Literal[AgentTurnResponseEventType.step_progress.value] = AgentTurnResponseEventType.step_progress.value
step_type: StepType
step_id: str
@ -217,17 +225,13 @@ class AgentTurnResponseStepProgressPayload(BaseModel):
@json_schema_type
class AgentTurnResponseTurnStartPayload(BaseModel):
event_type: Literal[AgentTurnResponseEventType.turn_start.value] = (
AgentTurnResponseEventType.turn_start.value
)
event_type: Literal[AgentTurnResponseEventType.turn_start.value] = AgentTurnResponseEventType.turn_start.value
turn_id: str
@json_schema_type
class AgentTurnResponseTurnCompletePayload(BaseModel):
event_type: Literal[AgentTurnResponseEventType.turn_complete.value] = (
AgentTurnResponseEventType.turn_complete.value
)
event_type: Literal[AgentTurnResponseEventType.turn_complete.value] = AgentTurnResponseEventType.turn_complete.value
turn: Turn
@ -280,6 +284,7 @@ class AgentTurnCreateRequest(AgentConfigOverridablePerTurn):
toolgroups: Optional[List[AgentToolGroup]] = None
stream: Optional[bool] = False
tool_config: Optional[ToolConfig] = None
@json_schema_type
@ -297,6 +302,16 @@ class AgentStepResponse(BaseModel):
@runtime_checkable
@trace_protocol
class Agents(Protocol):
"""Agents API for creating and interacting with agentic systems.
Main functionalities provided by this API:
- Create agents with specific instructions and ability to use tools.
- Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
- Agents can be provided with various shields (see the Safety API for more details).
- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
"""
@webmethod(route="/agents", method="POST")
async def create_agent(
self,
@ -317,10 +332,12 @@ class Agents(Protocol):
stream: Optional[bool] = False,
documents: Optional[List[Document]] = None,
toolgroups: Optional[List[AgentToolGroup]] = None,
tool_config: Optional[ToolConfig] = None,
) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]: ...
@webmethod(
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}", method="GET"
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}",
method="GET",
)
async def get_agents_turn(
self,

View file

@ -13,7 +13,6 @@ from termcolor import cprint
from llama_stack.apis.agents import AgentTurnResponseEventType, StepType
from llama_stack.apis.common.content_types import ToolCallParseStatus
from llama_stack.apis.inference import ToolResponseMessage
from llama_stack.providers.utils.inference.prompt_adapter import (
interleaved_content_as_str,
)
@ -63,9 +62,7 @@ class EventLogger:
if isinstance(chunk, ToolResponseMessage):
yield (
chunk,
LogEvent(
role="CustomTool", content=chunk.content, color="grey"
),
LogEvent(role="CustomTool", content=chunk.content, color="grey"),
)
continue
@ -81,17 +78,12 @@ class EventLogger:
step_type = event.payload.step_type
# handle safety
if (
step_type == StepType.shield_call
and event_type == EventType.step_complete.value
):
if step_type == StepType.shield_call and event_type == EventType.step_complete.value:
violation = event.payload.step_details.violation
if not violation:
yield (
event,
LogEvent(
role=step_type, content="No Violation", color="magenta"
),
LogEvent(role=step_type, content="No Violation", color="magenta"),
)
else:
yield (
@ -110,9 +102,7 @@ class EventLogger:
# TODO: Currently this event is never received
yield (
event,
LogEvent(
role=step_type, content="", end="", color="yellow"
),
LogEvent(role=step_type, content="", end="", color="yellow"),
)
elif event_type == EventType.step_progress.value:
# HACK: if previous was not step/event was not inference's step_progress
@ -125,9 +115,7 @@ class EventLogger:
):
yield (
event,
LogEvent(
role=step_type, content="", end="", color="yellow"
),
LogEvent(role=step_type, content="", end="", color="yellow"),
)
delta = event.payload.delta
@ -161,9 +149,7 @@ class EventLogger:
if event_type == EventType.step_complete.value:
response = event.payload.step_details.model_response
if response.tool_calls:
content = ToolUtils.encode_tool_call(
response.tool_calls[0], tool_prompt_format
)
content = ToolUtils.encode_tool_call(response.tool_calls[0], tool_prompt_format)
else:
content = response.content
yield (
@ -202,10 +188,7 @@ class EventLogger:
),
)
if (
step_type == StepType.memory_retrieval
and event_type == EventType.step_complete.value
):
if step_type == StepType.memory_retrieval and event_type == EventType.step_complete.value:
details = event.payload.step_details
inserted_context = interleaved_content_as_str(details.inserted_context)
content = f"fetched {len(inserted_context)} bytes from {details.vector_db_ids}"

View file

@ -7,13 +7,15 @@
from typing import List, Optional, Protocol, runtime_checkable
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
from pydantic import BaseModel
from llama_stack.apis.inference import (
CompletionMessage,
ChatCompletionResponse,
CompletionResponse,
InterleavedContent,
LogProbConfig,
Message,
ResponseFormat,
SamplingParams,
ToolChoice,
ToolDefinition,
@ -21,35 +23,14 @@ from llama_stack.apis.inference import (
)
@json_schema_type
class BatchCompletionRequest(BaseModel):
model: str
content_batch: List[InterleavedContent]
sampling_params: Optional[SamplingParams] = SamplingParams()
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class BatchCompletionResponse(BaseModel):
completion_message_batch: List[CompletionMessage]
@json_schema_type
class BatchChatCompletionRequest(BaseModel):
model: str
messages_batch: List[List[Message]]
sampling_params: Optional[SamplingParams] = SamplingParams()
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
logprobs: Optional[LogProbConfig] = None
batch: List[CompletionResponse]
@json_schema_type
class BatchChatCompletionResponse(BaseModel):
completion_message_batch: List[CompletionMessage]
batch: List[ChatCompletionResponse]
@runtime_checkable
@ -60,6 +41,7 @@ class BatchInference(Protocol):
model: str,
content_batch: List[InterleavedContent],
sampling_params: Optional[SamplingParams] = SamplingParams(),
response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None,
) -> BatchCompletionResponse: ...
@ -73,5 +55,6 @@ class BatchInference(Protocol):
tools: Optional[List[ToolDefinition]] = list,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = None,
response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None,
) -> BatchChatCompletionResponse: ...

View file

@ -4,14 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import base64
from enum import Enum
from typing import Annotated, List, Literal, Optional, Union
from llama_models.llama3.api.datatypes import ToolCall
from llama_models.schema_utils import json_schema_type, register_schema
from pydantic import BaseModel, Field, field_serializer, model_validator
from pydantic import BaseModel, Field, model_validator
@json_schema_type
@ -20,8 +18,16 @@ class URL(BaseModel):
class _URLOrData(BaseModel):
"""
A URL or a base64 encoded string
:param url: A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits.
:param data: base64 encoded image data as string
"""
url: Optional[URL] = None
data: Optional[bytes] = None
# data is a base64 encoded string, hint with contentEncoding=base64
data: Optional[str] = Field(contentEncoding="base64", default=None)
@model_validator(mode="before")
@classmethod
@ -30,21 +36,27 @@ class _URLOrData(BaseModel):
return values
return {"url": values}
@field_serializer("data")
def serialize_data(self, data: Optional[bytes], _info):
if data is None:
return None
return base64.b64encode(data).decode("utf-8")
@json_schema_type
class ImageContentItem(BaseModel):
"""A image content item
:param type: Discriminator type of the content item. Always "image"
:param image: Image as a base64 encoded string or an URL
"""
type: Literal["image"] = "image"
image: _URLOrData
@json_schema_type
class TextContentItem(BaseModel):
"""A text content item
:param type: Discriminator type of the content item. Always "text"
:param text: Text content
"""
type: Literal["text"] = "text"
text: str
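Since `_URLOrData.data` is now a base64-encoded string rather than raw bytes, callers encode image files themselves. A minimal sketch, assuming `ImageContentItem` and `TextContentItem` are importable from the content_types module shown above and using a placeholder file name:

```python
import base64

from llama_stack.apis.common.content_types import ImageContentItem, TextContentItem

# "llama.png" is a placeholder path used only for illustration.
with open("llama.png", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

image_item = ImageContentItem(image={"data": encoded})
text_item = TextContentItem(text="Describe this image.")
```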
@ -77,7 +89,6 @@ class ImageDelta(BaseModel):
image: bytes
@json_schema_type
class ToolCallParseStatus(Enum):
started = "started"
in_progress = "in_progress"

View file

@ -8,7 +8,6 @@ from enum import Enum
from typing import Any, Dict, Optional
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel
from llama_stack.apis.common.content_types import URL

View file

@ -39,6 +39,4 @@ class DatasetIO(Protocol):
) -> PaginatedRowsResult: ...
@webmethod(route="/datasetio/rows", method="POST")
async def append_rows(
self, dataset_id: str, rows: List[Dict[str, Any]]
) -> None: ...
async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...

View file

@ -58,7 +58,7 @@ class Datasets(Protocol):
metadata: Optional[Dict[str, Any]] = None,
) -> None: ...
@webmethod(route="/datasets/{dataset_id}", method="GET")
@webmethod(route="/datasets/{dataset_id:path}", method="GET")
async def get_dataset(
self,
dataset_id: str,
@ -67,7 +67,7 @@ class Datasets(Protocol):
@webmethod(route="/datasets", method="GET")
async def list_datasets(self) -> ListDatasetsResponse: ...
@webmethod(route="/datasets/{dataset_id}", method="DELETE")
@webmethod(route="/datasets/{dataset_id:path}", method="DELETE")
async def unregister_dataset(
self,
dataset_id: str,

View file

@ -63,9 +63,7 @@ class AppEvalTaskConfig(BaseModel):
EvalTaskConfig = register_schema(
Annotated[
Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")
],
Annotated[Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")],
name="EvalTaskConfig",
)

View file

@ -13,8 +13,8 @@ from typing import (
Literal,
Optional,
Protocol,
runtime_checkable,
Union,
runtime_checkable,
)
from llama_models.llama3.api.datatypes import (
@ -31,15 +31,27 @@ from typing_extensions import Annotated
from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
from llama_stack.apis.models import Model
from llama_stack.apis.telemetry.telemetry import MetricResponseMixin
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
class LogProbConfig(BaseModel):
"""
:param top_k: How many tokens (for each position) to return log probabilities for.
"""
top_k: Optional[int] = 0
@json_schema_type
class QuantizationType(Enum):
"""Type of model quantization to run inference with.
:cvar bf16: BFloat16 typically this means _no_ quantization
:cvar fp8: 8-bit floating point quantization
:cvar int4: 4-bit integer quantization
"""
bf16 = "bf16"
fp8 = "fp8"
int4 = "int4"
@ -57,6 +69,12 @@ class Bf16QuantizationConfig(BaseModel):
@json_schema_type
class Int4QuantizationConfig(BaseModel):
"""Configuration for 4-bit integer quantization.
:param type: Must be "int4" to identify this quantization type
:param scheme: Quantization scheme to use. Defaults to "int4_weight_int8_dynamic_activation"
"""
type: Literal["int4"] = "int4"
scheme: Optional[str] = "int4_weight_int8_dynamic_activation"
@ -69,6 +87,13 @@ QuantizationConfig = Annotated[
@json_schema_type
class UserMessage(BaseModel):
"""A message from the user in a chat conversation.
:param role: Must be "user" to identify this as a user message
:param content: The content of the message, which can include text and other media
:param context: (Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future.
"""
role: Literal["user"] = "user"
content: InterleavedContent
context: Optional[InterleavedContent] = None
@ -76,15 +101,27 @@ class UserMessage(BaseModel):
@json_schema_type
class SystemMessage(BaseModel):
"""A system message providing instructions or context to the model.
:param role: Must be "system" to identify this as a system message
:param content: The content of the "system prompt". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions).
"""
role: Literal["system"] = "system"
content: InterleavedContent
@json_schema_type
class ToolResponseMessage(BaseModel):
"""A message representing the result of a tool invocation.
:param role: Must be "tool" to identify this as a tool response
:param call_id: Unique identifier for the tool call this response is for
:param tool_name: Name of the tool that was called
:param content: The response content from the tool
"""
role: Literal["tool"] = "tool"
# it was nice to re-use the ToolResponse type, but having all messages
# have a `content` type makes things nicer too
call_id: str
tool_name: Union[BuiltinTool, str]
content: InterleavedContent
@ -92,10 +129,21 @@ class ToolResponseMessage(BaseModel):
@json_schema_type
class CompletionMessage(BaseModel):
"""A message containing the model's (assistant) response in a chat conversation.
:param role: Must be "assistant" to identify this as the model's response
:param content: The content of the model's response
:param stop_reason: Reason why the model stopped generating. Options are:
- `StopReason.end_of_turn`: The model finished generating the entire response.
- `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response.
- `StopReason.out_of_tokens`: The model ran out of token budget.
:param tool_calls: List of tool calls. Each tool call is a ToolCall object.
"""
role: Literal["assistant"] = "assistant"
content: InterleavedContent
stop_reason: StopReason
tool_calls: List[ToolCall] = Field(default_factory=list)
tool_calls: Optional[List[ToolCall]] = Field(default_factory=list)
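The message docstrings above spell out the per-role fields; for example, `UserMessage.context` is the hook Llama Stack uses to pass RAG context. A small illustrative sketch:

```python
from llama_stack.apis.inference import SystemMessage, UserMessage

system = SystemMessage(content="You are a concise assistant.")
user = UserMessage(
    content="Summarize the attached notes.",
    # Optional RAG context; normally injected by Llama Stack itself.
    context="Notes: llamas are camelids native to South America.",
)
```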
Message = register_schema(
@ -129,19 +177,35 @@ class ToolResponse(BaseModel):
return v
@json_schema_type
class ToolChoice(Enum):
"""Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.
:cvar auto: The model may use tools if it determines that is appropriate.
:cvar required: The model must use tools.
"""
auto = "auto"
required = "required"
@json_schema_type
class TokenLogProbs(BaseModel):
"""Log probabilities for generated tokens.
:param logprobs_by_token: Dictionary mapping tokens to their log probabilities
"""
logprobs_by_token: Dict[str, float]
@json_schema_type
class ChatCompletionResponseEventType(Enum):
"""Types of events that can occur during chat completion.
:cvar start: Inference has started
:cvar complete: Inference is complete and a full response is available
:cvar progress: Inference is in progress and a partial response is available
"""
start = "start"
complete = "complete"
progress = "progress"
@ -149,7 +213,13 @@ class ChatCompletionResponseEventType(Enum):
@json_schema_type
class ChatCompletionResponseEvent(BaseModel):
"""Chat completion response event."""
"""An event during chat completion generation.
:param event_type: Type of the event
:param delta: Content generated since last event. This can be one or more tokens, or a tool call.
:param logprobs: Optional log probabilities for generated tokens
:param stop_reason: Optional reason why generation stopped, if complete
"""
event_type: ChatCompletionResponseEventType
delta: ContentDelta
@ -157,22 +227,37 @@ class ChatCompletionResponseEvent(BaseModel):
stop_reason: Optional[StopReason] = None
@json_schema_type
class ResponseFormatType(Enum):
"""Types of formats for structured (guided) decoding.
:cvar json_schema: Response should conform to a JSON schema. In a Python SDK, this is often a `pydantic` model.
:cvar grammar: Response should conform to a BNF grammar
"""
json_schema = "json_schema"
grammar = "grammar"
@json_schema_type
class JsonSchemaResponseFormat(BaseModel):
type: Literal[ResponseFormatType.json_schema.value] = (
ResponseFormatType.json_schema.value
)
"""Configuration for JSON schema-guided response generation.
:param type: Must be "json_schema" to identify this format type
:param json_schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model.
"""
type: Literal[ResponseFormatType.json_schema.value] = ResponseFormatType.json_schema.value
json_schema: Dict[str, Any]
@json_schema_type
class GrammarResponseFormat(BaseModel):
"""Configuration for grammar-guided response generation.
:param type: Must be "grammar" to identify this format type
:param bnf: The BNF grammar specification the response should conform to
"""
type: Literal[ResponseFormatType.grammar.value] = ResponseFormatType.grammar.value
bnf: Dict[str, Any]
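A short sketch of the `json_schema` response format, assuming the classes above are importable from `llama_stack.apis.inference` and using a hypothetical pydantic model as the schema:

```python
from typing import List

from pydantic import BaseModel

from llama_stack.apis.inference import JsonSchemaResponseFormat


class Recipe(BaseModel):
    # Hypothetical schema used only for illustration.
    name: str
    steps: List[str]


# Guide decoding so the completion conforms to Recipe's JSON schema.
response_format = JsonSchemaResponseFormat(json_schema=Recipe.model_json_schema())
```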
@ -186,20 +271,24 @@ ResponseFormat = register_schema(
)
@json_schema_type
# This is an internally used class
class CompletionRequest(BaseModel):
model: str
content: InterleavedContent
sampling_params: Optional[SamplingParams] = SamplingParams()
response_format: Optional[ResponseFormat] = None
stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class CompletionResponse(BaseModel):
"""Completion response."""
"""Response from a completion request.
:param content: The generated completion text
:param stop_reason: Reason why generation stopped
:param logprobs: Optional log probabilities for generated tokens
"""
content: str
stop_reason: StopReason
@ -208,80 +297,95 @@ class CompletionResponse(BaseModel):
@json_schema_type
class CompletionResponseStreamChunk(BaseModel):
"""streamed completion response."""
"""A chunk of a streamed completion response.
:param delta: New content generated since last chunk. This can be one or more tokens.
:param stop_reason: Optional reason why generation stopped, if complete
:param logprobs: Optional log probabilities for generated tokens
"""
delta: str
stop_reason: Optional[StopReason] = None
logprobs: Optional[List[TokenLogProbs]] = None
@json_schema_type
class BatchCompletionRequest(BaseModel):
model: str
content_batch: List[InterleavedContent]
sampling_params: Optional[SamplingParams] = SamplingParams()
response_format: Optional[ResponseFormat] = None
logprobs: Optional[LogProbConfig] = None
class SystemMessageBehavior(Enum):
"""Config for how to override the default system prompt.
:cvar append: Appends the provided system message to the default system prompt:
https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2/#-function-definitions-in-the-system-prompt-
:cvar replace: Replaces the default system prompt with the provided system message. The system message can include the string
'{{function_definitions}}' to indicate where the function definitions should be inserted.
"""
append = "append"
replace = "replace"
@json_schema_type
class BatchCompletionResponse(BaseModel):
"""Batch completion response."""
class ToolConfig(BaseModel):
"""Configuration for tool use.
batch: List[CompletionResponse]
:param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
:param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
- `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
:param system_message_behavior: (Optional) Config for how to override the default system prompt.
- `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt.
- `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string
'{{function_definitions}}' to indicate where the function definitions should be inserted.
"""
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
system_message_behavior: SystemMessageBehavior = Field(default=SystemMessageBehavior.append)
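A sketch of `SystemMessageBehavior.replace`: the caller supplies the entire system prompt and marks where the auto-generated tool definitions should be spliced in (import path assumed from this module):

```python
from llama_stack.apis.inference import SystemMessage, SystemMessageBehavior, ToolConfig

tool_config = ToolConfig(system_message_behavior=SystemMessageBehavior.replace)
system = SystemMessage(
    # '{{function_definitions}}' marks where tool definitions are inserted.
    content="You are a terse assistant.\n{{function_definitions}}"
)
```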
# This is an internally used class
@json_schema_type
class ChatCompletionRequest(BaseModel):
model: str
messages: List[Message]
sampling_params: Optional[SamplingParams] = SamplingParams()
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
response_format: Optional[ResponseFormat] = None
tool_config: Optional[ToolConfig] = Field(default_factory=ToolConfig)
response_format: Optional[ResponseFormat] = None
stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class ChatCompletionResponseStreamChunk(BaseModel):
"""SSE-stream of these events."""
class ChatCompletionResponseStreamChunk(MetricResponseMixin, BaseModel):
"""A chunk of a streamed chat completion response.
:param event: The event containing the new content
"""
event: ChatCompletionResponseEvent
@json_schema_type
class ChatCompletionResponse(BaseModel):
"""Chat completion response."""
class ChatCompletionResponse(MetricResponseMixin, BaseModel):
"""Response from a chat completion request.
:param completion_message: The complete response message
:param logprobs: Optional log probabilities for generated tokens
"""
completion_message: CompletionMessage
logprobs: Optional[List[TokenLogProbs]] = None
@json_schema_type
class BatchChatCompletionRequest(BaseModel):
model: str
messages_batch: List[List[Message]]
sampling_params: Optional[SamplingParams] = SamplingParams()
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
logprobs: Optional[LogProbConfig] = None
@json_schema_type
class BatchChatCompletionResponse(BaseModel):
batch: List[ChatCompletionResponse]
@json_schema_type
class EmbeddingsResponse(BaseModel):
"""Response containing generated embeddings.
:param embeddings: List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
"""
embeddings: List[List[float]]
@ -292,6 +396,13 @@ class ModelStore(Protocol):
@runtime_checkable
@trace_protocol
class Inference(Protocol):
"""Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.
"""
model_store: ModelStore
@webmethod(route="/inference/completion", method="POST")
@ -303,7 +414,19 @@ class Inference(Protocol):
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]: ...
) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
"""Generate a completion for the given content using the specified model.
:param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param content: The content to generate a completion for
:param sampling_params: (Optional) Parameters to control the sampling strategy
:param response_format: (Optional) Grammar specification for guided (structured) decoding
:param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
:param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
:returns: If stream=False, returns a CompletionResponse with the full completion.
If stream=True, returns an SSE event stream of CompletionResponseStreamChunk
"""
...
@webmethod(route="/inference/chat-completion", method="POST")
async def chat_completion(
@ -311,20 +434,50 @@ class Inference(Protocol):
model_id: str,
messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(),
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = None,
tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = None,
response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None,
) -> Union[
ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]
]: ...
tool_config: Optional[ToolConfig] = None,
) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
"""Generate a chat completion for the given messages using the specified model.
:param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param messages: List of messages in the conversation
:param sampling_params: Parameters to control the sampling strategy
:param tools: (Optional) List of tool definitions available to the model
:param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
.. deprecated::
Use tool_config instead.
:param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
- `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
.. deprecated::
Use tool_config instead.
:param response_format: (Optional) Grammar specification for guided (structured) decoding. There are two options:
- `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format.
- `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it.
:param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
:param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
:param tool_config: (Optional) Configuration for tool use.
:returns: If stream=False, returns a ChatCompletionResponse with the full completion.
If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk
"""
...
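On the client side, the streaming variant of this endpoint yields `ChatCompletionResponseStreamChunk` events. A minimal sketch reusing the `client` and `INFERENCE_MODEL` from the earlier script; the exact attribute layout of text deltas is an assumption:

```python
response = client.inference.chat_completion(
    model_id=INFERENCE_MODEL,
    messages=[{"role": "user", "content": "Write a haiku about llamas."}],
    stream=True,
)
for chunk in response:
    # Each chunk wraps a ChatCompletionResponseEvent; text deltas are assumed
    # to expose a `.text` attribute.
    delta = chunk.event.delta
    print(getattr(delta, "text", ""), end="", flush=True)
print()
```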
@webmethod(route="/inference/embeddings", method="POST")
async def embeddings(
self,
model_id: str,
contents: List[InterleavedContent],
) -> EmbeddingsResponse: ...
) -> EmbeddingsResponse:
"""Generate embeddings for content pieces using the specified model.
:param model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
:param contents: List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text.
:returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
"""
...
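A corresponding client-side sketch for embeddings, assuming the Python client mirrors the `/inference/embeddings` route and using a placeholder embedding model identifier:

```python
result = client.inference.embeddings(
    model_id="all-MiniLM-L6-v2",  # placeholder; use a registered embedding model
    contents=["What is Llama Stack?", "How do agents use tools?"],
)
# One embedding vector per input; dimensionality is model-specific.
print(len(result.embeddings), len(result.embeddings[0]))
```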

View file

@ -62,7 +62,7 @@ class Models(Protocol):
@webmethod(route="/models", method="GET")
async def list_models(self) -> ListModelsResponse: ...
@webmethod(route="/models/{model_id}", method="GET")
@webmethod(route="/models/{model_id:path}", method="GET")
async def get_model(
self,
model_id: str,
@ -78,7 +78,7 @@ class Models(Protocol):
model_type: Optional[ModelType] = None,
) -> Model: ...
@webmethod(route="/models/{model_id}", method="DELETE")
@webmethod(route="/models/{model_id:path}", method="DELETE")
async def unregister_model(
self,
model_id: str,

View file

@ -89,9 +89,7 @@ class QATFinetuningConfig(BaseModel):
AlgorithmConfig = register_schema(
Annotated[
Union[LoraFinetuningConfig, QATFinetuningConfig], Field(discriminator="type")
],
Annotated[Union[LoraFinetuningConfig, QATFinetuningConfig], Field(discriminator="type")],
name="AlgorithmConfig",
)
@ -204,14 +202,10 @@ class PostTraining(Protocol):
async def get_training_jobs(self) -> ListPostTrainingJobsResponse: ...
@webmethod(route="/post-training/job/status", method="GET")
async def get_training_job_status(
self, job_uuid: str
) -> Optional[PostTrainingJobStatusResponse]: ...
async def get_training_job_status(self, job_uuid: str) -> Optional[PostTrainingJobStatusResponse]: ...
@webmethod(route="/post-training/job/cancel", method="POST")
async def cancel_training_job(self, job_uuid: str) -> None: ...
@webmethod(route="/post-training/job/artifacts", method="GET")
async def get_training_job_artifacts(
self, job_uuid: str
) -> Optional[PostTrainingJobArtifactsResponse]: ...
async def get_training_job_artifacts(self, job_uuid: str) -> Optional[PostTrainingJobArtifactsResponse]: ...

View file

@ -6,11 +6,9 @@
from enum import Enum
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field
@json_schema_type
class ResourceType(Enum):
model = "model"
shield = "shield"
@ -25,9 +23,7 @@ class ResourceType(Enum):
class Resource(BaseModel):
"""Base class for all Llama Stack resources"""
identifier: str = Field(
description="Unique identifier for this resource in llama stack"
)
identifier: str = Field(description="Unique identifier for this resource in llama stack")
provider_resource_id: str = Field(
description="Unique identifier for this resource in the provider",
@ -36,6 +32,4 @@ class Resource(BaseModel):
provider_id: str = Field(description="ID of the provider that owns this resource")
type: ResourceType = Field(
description="Type of resource (e.g. 'model', 'shield', 'vector_db', etc.)"
)
type: ResourceType = Field(description="Type of resource (e.g. 'model', 'shield', 'vector_db', etc.)")

View file

@ -12,8 +12,8 @@ from typing import (
Literal,
Optional,
Protocol,
runtime_checkable,
Union,
runtime_checkable,
)
from llama_models.schema_utils import json_schema_type, register_schema, webmethod
@ -43,9 +43,7 @@ class AggregationFunctionType(Enum):
@json_schema_type
class LLMAsJudgeScoringFnParams(BaseModel):
type: Literal[ScoringFnParamsType.llm_as_judge.value] = (
ScoringFnParamsType.llm_as_judge.value
)
type: Literal[ScoringFnParamsType.llm_as_judge.value] = ScoringFnParamsType.llm_as_judge.value
judge_model: str
prompt_template: Optional[str] = None
judge_score_regexes: Optional[List[str]] = Field(
@ -60,9 +58,7 @@ class LLMAsJudgeScoringFnParams(BaseModel):
@json_schema_type
class RegexParserScoringFnParams(BaseModel):
type: Literal[ScoringFnParamsType.regex_parser.value] = (
ScoringFnParamsType.regex_parser.value
)
type: Literal[ScoringFnParamsType.regex_parser.value] = ScoringFnParamsType.regex_parser.value
parsing_regexes: Optional[List[str]] = Field(
description="Regex to extract the answer from generated response",
default_factory=list,
@ -112,9 +108,7 @@ class CommonScoringFnFields(BaseModel):
@json_schema_type
class ScoringFn(CommonScoringFnFields, Resource):
type: Literal[ResourceType.scoring_function.value] = (
ResourceType.scoring_function.value
)
type: Literal[ResourceType.scoring_function.value] = ResourceType.scoring_function.value
@property
def scoring_fn_id(self) -> str:
@ -140,10 +134,8 @@ class ScoringFunctions(Protocol):
@webmethod(route="/scoring-functions", method="GET")
async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ...
@webmethod(route="/scoring-functions/{scoring_fn_id}", method="GET")
async def get_scoring_function(
self, scoring_fn_id: str, /
) -> Optional[ScoringFn]: ...
@webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET")
async def get_scoring_function(self, scoring_fn_id: str, /) -> Optional[ScoringFn]: ...
@webmethod(route="/scoring-functions", method="POST")
async def register_scoring_function(

View file

@ -48,7 +48,7 @@ class Shields(Protocol):
@webmethod(route="/shields", method="GET")
async def list_shields(self) -> ListShieldsResponse: ...
@webmethod(route="/shields/{identifier}", method="GET")
@webmethod(route="/shields/{identifier:path}", method="GET")
async def get_shield(self, identifier: str) -> Optional[Shield]: ...
@webmethod(route="/shields", method="POST")

View file

@ -5,11 +5,9 @@
# the root directory of this source tree.
from enum import Enum
from typing import Any, Dict, List, Optional, Protocol, Union
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel
from llama_stack.apis.inference import Message

View file

@ -13,10 +13,11 @@ from typing import (
Literal,
Optional,
Protocol,
runtime_checkable,
Union,
runtime_checkable,
)
from llama_models.llama3.api.datatypes import Primitive
from llama_models.schema_utils import json_schema_type, register_schema, webmethod
from pydantic import BaseModel, Field
from typing_extensions import Annotated
@ -76,7 +77,7 @@ class EventCommon(BaseModel):
trace_id: str
span_id: str
timestamp: datetime
attributes: Optional[Dict[str, Any]] = Field(default_factory=dict)
attributes: Optional[Dict[str, Primitive]] = Field(default_factory=dict)
@json_schema_type
@ -94,6 +95,30 @@ class MetricEvent(EventCommon):
unit: str
# This is a short term solution to allow inference API to return metrics
# The ideal way to do this is to have a way for all response types to include metrics
# and all metric events logged to the telemetry API to be included with the response
# To do this, we will need to augment all response types with a metrics field.
# We have hit a blocker from stainless SDK that prevents us from doing this.
# The blocker is that if we were to augment the response types that have a data field
# in them like so
# class ListModelsResponse(BaseModel):
# metrics: Optional[List[MetricEvent]] = None
# data: List[Models]
# ...
# The client SDK will need to access the data by using a .data field, which is not
# ergonomic. Stainless SDK does support unwrapping the response type, but it
# requires that the response type have only a single field.
# We will need a way in the client SDK to signal that the metrics are needed
# and if they are needed, the client SDK has to return the full response type
# without unwrapping it.
class MetricResponseMixin(BaseModel):
metrics: Optional[List[MetricEvent]] = None
@json_schema_type
class StructuredLogType(Enum):
SPAN_START = "span_start"
@ -102,9 +127,7 @@ class StructuredLogType(Enum):
@json_schema_type
class SpanStartPayload(BaseModel):
type: Literal[StructuredLogType.SPAN_START.value] = (
StructuredLogType.SPAN_START.value
)
type: Literal[StructuredLogType.SPAN_START.value] = StructuredLogType.SPAN_START.value
name: str
parent_span_id: Optional[str] = None
@ -190,9 +213,7 @@ class QuerySpanTreeResponse(BaseModel):
@runtime_checkable
class Telemetry(Protocol):
@webmethod(route="/telemetry/events", method="POST")
async def log_event(
self, event: Event, ttl_seconds: int = DEFAULT_TTL_DAYS * 86400
) -> None: ...
async def log_event(self, event: Event, ttl_seconds: int = DEFAULT_TTL_DAYS * 86400) -> None: ...
@webmethod(route="/telemetry/traces", method="GET")
async def query_traces(
@ -203,13 +224,13 @@ class Telemetry(Protocol):
order_by: Optional[List[str]] = None,
) -> QueryTracesResponse: ...
@webmethod(route="/telemetry/traces/{trace_id}", method="GET")
@webmethod(route="/telemetry/traces/{trace_id:path}", method="GET")
async def get_trace(self, trace_id: str) -> Trace: ...
@webmethod(route="/telemetry/traces/{trace_id}/spans/{span_id}", method="GET")
@webmethod(route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}", method="GET")
async def get_span(self, trace_id: str, span_id: str) -> Span: ...
@webmethod(route="/telemetry/spans/{span_id}/tree", method="GET")
@webmethod(route="/telemetry/spans/{span_id:path}/tree", method="GET")
async def get_span_tree(
self,
span_id: str,

View file

@ -4,5 +4,5 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .tools import * # noqa: F401 F403
from .rag_tool import * # noqa: F401 F403
from .tools import * # noqa: F401 F403

View file

@ -11,7 +11,7 @@ from llama_models.schema_utils import json_schema_type, register_schema, webmeth
from pydantic import BaseModel, Field
from typing_extensions import Annotated, Protocol, runtime_checkable
from llama_stack.apis.common.content_types import InterleavedContent, URL
from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -64,9 +64,7 @@ RAGQueryGeneratorConfig = register_schema(
class RAGQueryConfig(BaseModel):
# This config defines how a query is generated using the messages
# for memory bank retrieval.
query_generator_config: RAGQueryGeneratorConfig = Field(
default=DefaultRAGQueryGeneratorConfig()
)
query_generator_config: RAGQueryGeneratorConfig = Field(default=DefaultRAGQueryGeneratorConfig())
max_tokens_in_context: int = 4096
max_chunks: int = 5

View file

@ -11,7 +11,7 @@ from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field
from typing_extensions import Protocol, runtime_checkable
from llama_stack.apis.common.content_types import InterleavedContent, URL
from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -101,7 +101,7 @@ class ToolGroups(Protocol):
"""Register a tool group"""
...
@webmethod(route="/toolgroups/{toolgroup_id}", method="GET")
@webmethod(route="/toolgroups/{toolgroup_id:path}", method="GET")
async def get_tool_group(
self,
toolgroup_id: str,
@ -117,13 +117,13 @@ class ToolGroups(Protocol):
"""List tools with optional tool group"""
...
@webmethod(route="/tools/{tool_name}", method="GET")
@webmethod(route="/tools/{tool_name:path}", method="GET")
async def get_tool(
self,
tool_name: str,
) -> Tool: ...
@webmethod(route="/toolgroups/{toolgroup_id}", method="DELETE")
@webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE")
async def unregister_toolgroup(
self,
toolgroup_id: str,
@ -150,8 +150,6 @@ class ToolRuntime(Protocol):
) -> List[ToolDef]: ...
@webmethod(route="/tool-runtime/invoke", method="POST")
async def invoke_tool(
self, tool_name: str, kwargs: Dict[str, Any]
) -> ToolInvocationResult:
async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
"""Run a tool with the given arguments"""
...

View file

@ -46,7 +46,7 @@ class VectorDBs(Protocol):
@webmethod(route="/vector-dbs", method="GET")
async def list_vector_dbs(self) -> ListVectorDBsResponse: ...
@webmethod(route="/vector-dbs/{vector_db_id}", method="GET")
@webmethod(route="/vector-dbs/{vector_db_id:path}", method="GET")
async def get_vector_db(
self,
vector_db_id: str,
@ -62,5 +62,5 @@ class VectorDBs(Protocol):
provider_vector_db_id: Optional[str] = None,
) -> VectorDB: ...
@webmethod(route="/vector-dbs/{vector_db_id}", method="DELETE")
@webmethod(route="/vector-dbs/{vector_db_id:path}", method="DELETE")
async def unregister_vector_db(self, vector_db_id: str) -> None: ...

View file

@ -16,11 +16,9 @@ from pathlib import Path
from typing import Dict, List, Optional
import httpx
from llama_models.datatypes import Model
from llama_models.sku_list import LlamaDownloadInfo
from pydantic import BaseModel, ConfigDict
from rich.console import Console
from rich.progress import (
BarColumn,
@ -147,9 +145,7 @@ class ParallelDownloader:
"follow_redirects": True,
}
async def retry_with_exponential_backoff(
self, task: DownloadTask, func, *args, **kwargs
):
async def retry_with_exponential_backoff(self, task: DownloadTask, func, *args, **kwargs):
last_exception = None
for attempt in range(task.max_retries):
try:
@ -166,13 +162,9 @@ class ParallelDownloader:
continue
raise last_exception
async def get_file_info(
self, client: httpx.AsyncClient, task: DownloadTask
) -> None:
async def get_file_info(self, client: httpx.AsyncClient, task: DownloadTask) -> None:
async def _get_info():
response = await client.head(
task.url, headers={"Accept-Encoding": "identity"}, **self.client_options
)
response = await client.head(task.url, headers={"Accept-Encoding": "identity"}, **self.client_options)
response.raise_for_status()
return response
@ -201,14 +193,10 @@ class ParallelDownloader:
return False
return os.path.getsize(task.output_file) == task.total_size
async def download_chunk(
self, client: httpx.AsyncClient, task: DownloadTask, start: int, end: int
) -> None:
async def download_chunk(self, client: httpx.AsyncClient, task: DownloadTask, start: int, end: int) -> None:
async def _download_chunk():
headers = {"Range": f"bytes={start}-{end}"}
async with client.stream(
"GET", task.url, headers=headers, **self.client_options
) as response:
async with client.stream("GET", task.url, headers=headers, **self.client_options) as response:
response.raise_for_status()
with open(task.output_file, "ab") as file:
@ -225,8 +213,7 @@ class ParallelDownloader:
await self.retry_with_exponential_backoff(task, _download_chunk)
except Exception as e:
raise DownloadError(
f"Failed to download chunk {start}-{end} after "
f"{task.max_retries} attempts: {str(e)}"
f"Failed to download chunk {start}-{end} after {task.max_retries} attempts: {str(e)}"
) from e
async def prepare_download(self, task: DownloadTask) -> None:
@ -244,9 +231,7 @@ class ParallelDownloader:
# Check if file is already downloaded
if os.path.exists(task.output_file):
if self.verify_file_integrity(task):
self.console.print(
f"[green]Already downloaded {task.output_file}[/green]"
)
self.console.print(f"[green]Already downloaded {task.output_file}[/green]")
self.progress.update(task.task_id, completed=task.total_size)
return
@ -259,9 +244,7 @@ class ParallelDownloader:
current_pos = task.downloaded_size
while current_pos < task.total_size:
chunk_end = min(
current_pos + chunk_size - 1, task.total_size - 1
)
chunk_end = min(current_pos + chunk_size - 1, task.total_size - 1)
chunks.append((current_pos, chunk_end))
current_pos = chunk_end + 1
@ -273,18 +256,12 @@ class ParallelDownloader:
raise DownloadError(f"Download failed: {str(e)}") from e
except Exception as e:
self.progress.update(
task.task_id, description=f"[red]Failed: {task.output_file}[/red]"
)
raise DownloadError(
f"Download failed for {task.output_file}: {str(e)}"
) from e
self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]")
raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e
def has_disk_space(self, tasks: List[DownloadTask]) -> bool:
try:
total_remaining_size = sum(
task.total_size - task.downloaded_size for task in tasks
)
total_remaining_size = sum(task.total_size - task.downloaded_size for task in tasks)
dir_path = os.path.dirname(os.path.abspath(tasks[0].output_file))
free_space = shutil.disk_usage(dir_path).free
@ -314,9 +291,7 @@ class ParallelDownloader:
with self.progress:
for task in tasks:
desc = f"Downloading {Path(task.output_file).name}"
task.task_id = self.progress.add_task(
desc, total=task.total_size, completed=task.downloaded_size
)
task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size)
semaphore = asyncio.Semaphore(self.max_concurrent_downloads)
@ -332,9 +307,7 @@ class ParallelDownloader:
if failed_tasks:
self.console.print("\n[red]Some downloads failed:[/red]")
for task, error in failed_tasks:
self.console.print(
f"[red]- {Path(task.output_file).name}: {error}[/red]"
)
self.console.print(f"[red]- {Path(task.output_file).name}: {error}[/red]")
raise DownloadError(f"{len(failed_tasks)} downloads failed")
@ -396,11 +369,7 @@ def _meta_download(
output_file = str(output_dir / f)
url = meta_url.replace("*", f"{info.folder}/{f}")
total_size = info.pth_size if "consolidated" in f else 0
tasks.append(
DownloadTask(
url=url, output_file=output_file, total_size=total_size, max_retries=3
)
)
tasks.append(DownloadTask(url=url, output_file=output_file, total_size=total_size, max_retries=3))
# Initialize and run parallel downloader
downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
@ -446,14 +415,10 @@ def _download_from_manifest(manifest_file: str, max_concurrent_downloads: int):
os.makedirs(output_dir, exist_ok=True)
if any(output_dir.iterdir()):
console.print(
f"[yellow]Output directory {output_dir} is not empty.[/yellow]"
)
console.print(f"[yellow]Output directory {output_dir} is not empty.[/yellow]")
while True:
resp = input(
"Do you want to (C)ontinue download or (R)estart completely? (continue/restart): "
)
resp = input("Do you want to (C)ontinue download or (R)estart completely? (continue/restart): ")
if resp.lower() in ["restart", "r"]:
shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True)
@ -471,9 +436,7 @@ def _download_from_manifest(manifest_file: str, max_concurrent_downloads: int):
]
# Initialize and run parallel downloader
downloader = ParallelDownloader(
max_concurrent_downloads=max_concurrent_downloads
)
downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
asyncio.run(downloader.download_all(tasks))

View file

@ -8,7 +8,6 @@ import argparse
import json
from llama_models.sku_list import resolve_model
from termcolor import colored
from llama_stack.cli.subcommand import Subcommand

View file

@ -38,7 +38,7 @@ class ModelList(Subcommand):
headers = [
"Model Descriptor",
"Hugging Face Repo",
"Model ID",
"Context Length",
]

View file

@ -11,7 +11,6 @@ from llama_stack.cli.model.download import ModelDownload
from llama_stack.cli.model.list import ModelList
from llama_stack.cli.model.prompt_format import ModelPromptFormat
from llama_stack.cli.model.verify_download import ModelVerifyDownload
from llama_stack.cli.subcommand import Subcommand
@ -26,6 +25,8 @@ class ModelParser(Subcommand):
description="Work with llama models",
)
self.parser.set_defaults(func=lambda args: self.parser.print_help())
subparsers = self.parser.add_subparsers(title="model_subcommands")
# Add sub-commands

View file

@ -8,7 +8,7 @@ import argparse
import textwrap
from io import StringIO
from llama_models.datatypes import CoreModelId, is_multimodal, model_family, ModelFamily
from llama_models.datatypes import CoreModelId, ModelFamily, is_multimodal, model_family
from llama_stack.cli.subcommand import Subcommand
@ -47,33 +47,20 @@ class ModelPromptFormat(Subcommand):
# Only Llama 3.1 and 3.2 are supported
supported_model_ids = [
m
for m in CoreModelId
if model_family(m) in {ModelFamily.llama3_1, ModelFamily.llama3_2}
m for m in CoreModelId if model_family(m) in {ModelFamily.llama3_1, ModelFamily.llama3_2}
]
model_str = "\n".join([m.value for m in supported_model_ids])
try:
model_id = CoreModelId(args.model_name)
except ValueError:
self.parser.error(
f"{args.model_name} is not a valid Model. Choose one from --\n{model_str}"
)
self.parser.error(f"{args.model_name} is not a valid Model. Choose one from --\n{model_str}")
if model_id not in supported_model_ids:
self.parser.error(
f"{model_id} is not a valid Model. Choose one from --\n {model_str}"
)
self.parser.error(f"{model_id} is not a valid Model. Choose one from --\n {model_str}")
llama_3_1_file = (
importlib.resources.files("llama_models") / "llama3_1/prompt_format.md"
)
llama_3_2_text_file = (
importlib.resources.files("llama_models") / "llama3_2/text_prompt_format.md"
)
llama_3_2_vision_file = (
importlib.resources.files("llama_models")
/ "llama3_2/vision_prompt_format.md"
)
llama_3_1_file = importlib.resources.files("llama_models") / "llama3_1/prompt_format.md"
llama_3_2_text_file = importlib.resources.files("llama_models") / "llama3_2/text_prompt_format.md"
llama_3_2_vision_file = importlib.resources.files("llama_models") / "llama3_2/vision_prompt_format.md"
if model_family(model_id) == ModelFamily.llama3_1:
with importlib.resources.as_file(llama_3_1_file) as f:
content = f.open("r").read()

View file

@ -9,7 +9,6 @@ from typing import Any, Dict, Optional
from llama_models.datatypes import CheckpointQuantizationFormat
from llama_models.llama3.api.datatypes import SamplingParams
from llama_models.sku_list import LlamaDownloadInfo
from pydantic import BaseModel, ConfigDict, Field
@ -17,16 +16,12 @@ class PromptGuardModel(BaseModel):
"""Make a 'fake' Model-like object for Prompt Guard. Eventually this will be removed."""
model_id: str = "Prompt-Guard-86M"
description: str = (
"Prompt Guard. NOTE: this model will not be provided via `llama` CLI soon."
)
description: str = "Prompt Guard. NOTE: this model will not be provided via `llama` CLI soon."
is_featured: bool = False
huggingface_repo: str = "meta-llama/Prompt-Guard-86M"
max_seq_length: int = 2048
is_instruct_model: bool = False
quantization_format: CheckpointQuantizationFormat = (
CheckpointQuantizationFormat.bf16
)
quantization_format: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16
arch_args: Dict[str, Any] = Field(default_factory=dict)
recommended_sampling_params: Optional[SamplingParams] = None

View file

@ -21,8 +21,12 @@ from prompt_toolkit.validation import Validator
from termcolor import cprint
from llama_stack.cli.table import print_table
from llama_stack.distribution.build import build_image, ImageType
from llama_stack.distribution.build import (
SERVER_DEPENDENCIES,
ImageType,
build_image,
get_provider_dependencies,
)
from llama_stack.distribution.datatypes import (
BuildConfig,
DistributionSpec,
@ -35,7 +39,6 @@ from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
from llama_stack.distribution.utils.dynamic import instantiate_class_type
from llama_stack.providers.datatypes import Api
TEMPLATES_PATH = Path(__file__).parent.parent.parent / "templates"
@ -52,9 +55,7 @@ def available_templates_specs() -> Dict[str, BuildConfig]:
return template_specs
def run_stack_build_command(
parser: argparse.ArgumentParser, args: argparse.Namespace
) -> None:
def run_stack_build_command(args: argparse.Namespace) -> None:
if args.list_templates:
return _run_template_list_cmd()
@ -74,18 +75,11 @@ def run_stack_build_command(
build_config.image_type = args.image_type
else:
cprint(
f"Please specify a image-type (docker | conda | venv) for {args.template}",
f"Please specify a image-type (container | conda | venv) for {args.template}",
color="red",
)
return
_run_stack_build_command_from_build_config(
build_config,
image_name=image_name,
template_name=args.template,
)
return
if not args.config and not args.template:
elif not args.config and not args.template:
name = prompt(
"> Enter a name for your Llama Stack (e.g. my-local-stack): ",
validator=Validator.from_callable(
@ -95,10 +89,10 @@ def run_stack_build_command(
)
image_type = prompt(
"> Enter the image type you want your Llama Stack to be built as (docker or conda or venv): ",
"> Enter the image type you want your Llama Stack to be built as (container or conda or venv): ",
validator=Validator.from_callable(
lambda x: x in ["docker", "conda", "venv"],
error_message="Invalid image type, please enter conda or docker or venv",
lambda x: x in ["container", "conda", "venv"],
error_message="Invalid image type, please enter conda or container or venv",
),
default="conda",
)
@ -132,11 +126,7 @@ def run_stack_build_command(
providers = dict()
for api, providers_for_api in get_provider_registry().items():
available_providers = [
x
for x in providers_for_api.keys()
if x not in ("remote", "remote::sample")
]
available_providers = [x for x in providers_for_api.keys() if x not in ("remote", "remote::sample")]
api_provider = prompt(
"> Enter provider for API {}: ".format(api.value),
completer=WordCompleter(available_providers),
@ -159,9 +149,7 @@ def run_stack_build_command(
description=description,
)
build_config = BuildConfig(
image_type=image_type, distribution_spec=distribution_spec
)
build_config = BuildConfig(image_type=image_type, distribution_spec=distribution_spec)
else:
with open(args.config, "r") as f:
try:
@ -180,8 +168,20 @@ def run_stack_build_command(
)
return
if args.print_deps_only:
print(f"# Dependencies for {args.template or args.config or image_name}")
normal_deps, special_deps = get_provider_dependencies(build_config.distribution_spec.providers)
normal_deps += SERVER_DEPENDENCIES
print(f"uv pip install {' '.join(normal_deps)}")
for special_dep in special_deps:
print(f"uv pip install {special_dep}")
return
_run_stack_build_command_from_build_config(
build_config, image_name=image_name, config_path=args.config
build_config,
image_name=image_name,
config_path=args.config,
template_name=args.template,
)
@ -195,9 +195,7 @@ def _generate_run_config(
"""
apis = list(build_config.distribution_spec.providers.keys())
run_config = StackRunConfig(
container_image=(
image_name if build_config.image_type == ImageType.container.value else None
),
container_image=(image_name if build_config.image_type == ImageType.container.value else None),
image_name=image_name,
apis=apis,
providers={},
@ -217,13 +215,9 @@ def _generate_run_config(
if p.deprecation_error:
raise InvalidProviderError(p.deprecation_error)
config_type = instantiate_class_type(
provider_registry[Api(api)][provider_type].config_class
)
config_type = instantiate_class_type(provider_registry[Api(api)][provider_type].config_class)
if hasattr(config_type, "sample_run_config"):
config = config_type.sample_run_config(
__distro_dir__=f"distributions/{image_name}"
)
config = config_type.sample_run_config(__distro_dir__=f"distributions/{image_name}")
else:
config = {}
@ -258,9 +252,7 @@ def _run_stack_build_command_from_build_config(
image_name = f"distribution-{template_name}"
else:
if not image_name:
raise ValueError(
"Please specify an image name when building a docker image without a template"
)
raise ValueError("Please specify an image name when building a container image without a template")
elif build_config.image_type == ImageType.conda.value:
if not image_name:
raise ValueError("Please specify an image name when building a conda image")
@ -288,10 +280,7 @@ def _run_stack_build_command_from_build_config(
if template_name:
# copy run.yaml from template to build_dir instead of generating it again
template_path = (
importlib.resources.files("llama_stack")
/ f"templates/{template_name}/run.yaml"
)
template_path = importlib.resources.files("llama_stack") / f"templates/{template_name}/run.yaml"
with importlib.resources.as_file(template_path) as path:
run_config_file = build_dir / f"{template_name}-run.yaml"
shutil.copy(path, run_config_file)

View file

@ -63,10 +63,16 @@ environment is active, you must specify a name.
),
default=None,
)
self.parser.add_argument(
"--print-deps-only",
default=False,
action="store_true",
help="Print the dependencies for the stack only, without building the stack",
)
def _run_stack_build_command(self, args: argparse.Namespace) -> None:
# always keep implementation completely silo-ed away from CLI so CLI
# can be fast to load and reduces dependencies
from ._build import run_stack_build_command
return run_stack_build_command(self.parser, args)
return run_stack_build_command(args)

View file

@ -21,15 +21,19 @@ class StackListProviders(Subcommand):
self._add_arguments()
self.parser.set_defaults(func=self._run_providers_list_cmd)
def _add_arguments(self):
from llama_stack.distribution.datatypes import Api
@property
def providable_apis(self):
from llama_stack.distribution.distribution import providable_apis
api_values = [a.value for a in Api]
return [api.value for api in providable_apis()]
def _add_arguments(self):
self.parser.add_argument(
"api",
type=str,
choices=api_values,
help="API to list providers for (one of: {})".format(api_values),
choices=self.providable_apis,
nargs="?",
help="API to list providers for. List all if not specified.",
)
def _run_providers_list_cmd(self, args: argparse.Namespace) -> None:
@ -37,20 +41,29 @@ class StackListProviders(Subcommand):
from llama_stack.distribution.distribution import Api, get_provider_registry
all_providers = get_provider_registry()
providers_for_api = all_providers[Api(args.api)]
if args.api:
providers = [(args.api, all_providers[Api(args.api)])]
else:
providers = [(k.value, prov) for k, prov in all_providers.items()]
providers = [p for api, p in providers if api in self.providable_apis]
# eventually, this should query a registry at llama.meta.com/llamastack/distributions
headers = [
"API Type",
"Provider Type",
"PIP Package Dependencies",
]
rows = []
for spec in providers_for_api.values():
if spec.provider_type == "sample":
specs = [spec for p in providers for spec in p.values()]
for spec in specs:
if spec.is_sample:
continue
rows.append(
[
spec.api.value,
spec.provider_type,
",".join(spec.pip_packages),
]
@ -59,4 +72,5 @@ class StackListProviders(Subcommand):
rows,
headers,
separate_rows=True,
sort_by=(0, 1),
)

View file

@ -55,6 +55,23 @@ class StackRun(Subcommand):
default=[],
metavar="KEY=VALUE",
)
self.parser.add_argument(
"--tls-keyfile",
type=str,
help="Path to TLS key file for HTTPS",
)
self.parser.add_argument(
"--tls-certfile",
type=str,
help="Path to TLS certificate file for HTTPS",
)
self.parser.add_argument(
"--image-type",
type=str,
help="Image Type used during the build. This can be either conda or container or venv.",
choices=["conda", "container", "venv"],
default="conda",
)
def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
import importlib.resources
@ -82,31 +99,21 @@ class StackRun(Subcommand):
if not config_file.exists() and not has_yaml_suffix:
# check if this is a template
config_file = (
Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
)
config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
if config_file.exists():
template_name = args.config
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to conda dir
config_file = Path(
BUILDS_BASE_DIR / ImageType.conda.value / f"{args.config}-run.yaml"
)
config_file = Path(BUILDS_BASE_DIR / ImageType.conda.value / f"{args.config}-run.yaml")
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to container dir
config_file = Path(
BUILDS_BASE_DIR / ImageType.container.value / f"{args.config}-run.yaml"
)
config_file = Path(BUILDS_BASE_DIR / ImageType.container.value / f"{args.config}-run.yaml")
if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to ~/.llama dir
config_file = Path(
DISTRIBS_BASE_DIR
/ f"llamastack-{args.config}"
/ f"{args.config}-run.yaml"
)
config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml")
if not config_file.exists():
self.parser.error(
@ -118,18 +125,11 @@ class StackRun(Subcommand):
config_dict = yaml.safe_load(config_file.read_text())
config = parse_and_maybe_upgrade_config(config_dict)
if config.container_image:
script = (
importlib.resources.files("llama_stack")
/ "distribution/start_container.sh"
)
image_name = (
f"distribution-{template_name}"
if template_name
else config.container_image
)
if args.image_type == ImageType.container.value or config.container_image:
script = importlib.resources.files("llama_stack") / "distribution/start_container.sh"
image_name = f"distribution-{template_name}" if template_name else config.container_image
run_args = [script, image_name]
else:
elif args.image_type == ImageType.conda.value:
current_conda_env = os.environ.get("CONDA_DEFAULT_ENV")
image_name = args.image_name or current_conda_env
if not image_name:
@ -140,12 +140,12 @@ class StackRun(Subcommand):
return
def get_conda_prefix(env_name):
# Conda "base" environment does not end with "base" in the
# prefix, so should be handled separately.
if env_name == "base":
return os.environ.get("CONDA_PREFIX")
# Get conda environments info
conda_env_info = json.loads(
subprocess.check_output(
["conda", "info", "--envs", "--json"]
).decode()
)
conda_env_info = json.loads(subprocess.check_output(["conda", "info", "--envs", "--json"]).decode())
envs = conda_env_info["envs"]
for envpath in envs:
if envpath.endswith(env_name):
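
Because the old and new lines are interleaved in this hunk, here is the resulting helper consolidated into one runnable sketch; it assumes the `conda` executable is on PATH:

```python
import json
import os
import subprocess

def get_conda_prefix(env_name: str):
    # The "base" environment's prefix does not end with "base", so it is
    # read from CONDA_PREFIX instead of the environment listing.
    if env_name == "base":
        return os.environ.get("CONDA_PREFIX")
    # Ask conda for the environment prefixes and match on the suffix.
    conda_env_info = json.loads(subprocess.check_output(["conda", "info", "--envs", "--json"]).decode())
    for envpath in conda_env_info["envs"]:
        if envpath.endswith(env_name):
            return envpath
    return None
```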
@ -169,14 +169,20 @@ class StackRun(Subcommand):
)
return
script = (
importlib.resources.files("llama_stack")
/ "distribution/start_conda_env.sh"
)
script = importlib.resources.files("llama_stack") / "distribution/start_conda_env.sh"
run_args = [
script,
image_name,
]
else:
# else must be venv since that is the only valid option left.
current_venv = os.environ.get("VIRTUAL_ENV")
venv = args.image_name or current_venv
script = importlib.resources.files("llama_stack") / "distribution/start_venv.sh"
run_args = [
script,
venv,
]
run_args.extend([str(config_file), str(args.port)])
if args.disable_ipv6:
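
Taken together, the hunks in this file select a launcher script from the image type (or a `container_image` set in the run config). A consolidated sketch of that dispatch, with a simplified signature and assuming the `llama_stack` package is importable; the script paths come from the hunks above:

```python
import importlib.resources

def pick_start_script(image_type: str, container_image: str | None = None):
    # Simplified stand-in for the branching above: container wins if either the
    # flag or the run config asks for it, then conda, otherwise venv.
    files = importlib.resources.files("llama_stack")
    if image_type == "container" or container_image:
        return files / "distribution/start_container.sh"
    if image_type == "conda":
        return files / "distribution/start_conda_env.sh"
    # venv is the only remaining valid option
    return files / "distribution/start_venv.sh"
```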
@ -198,4 +204,7 @@ class StackRun(Subcommand):
return
run_args.extend(["--env", f"{key}={value}"])
if args.tls_keyfile and args.tls_certfile:
run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile])
run_with_pty(run_args)

View file

@ -31,6 +31,8 @@ class StackParser(Subcommand):
version=f"{version('llama-stack')}",
)
self.parser.set_defaults(func=lambda args: self.parser.print_help())
subparsers = self.parser.add_subparsers(title="stack_subcommands")
# Add sub-commands

View file

@ -6,6 +6,7 @@
import re
import textwrap
from typing import Iterable
from termcolor import cprint
@ -22,11 +23,7 @@ def format_row(row, col_widths):
if line.strip() == "":
lines.append("")
else:
lines.extend(
textwrap.wrap(
line, width, break_long_words=False, replace_whitespace=False
)
)
lines.extend(textwrap.wrap(line, width, break_long_words=False, replace_whitespace=False))
return lines
wrapped = [wrap(item, width) for item, width in zip(row, col_widths)]
@ -43,11 +40,15 @@ def format_row(row, col_widths):
return "\n".join(lines)
def print_table(rows, headers=None, separate_rows: bool = False):
def print_table(rows, headers=None, separate_rows: bool = False, sort_by: Iterable[int] = tuple()):
def itemlen(item):
return max([len(line) for line in strip_ansi_colors(item).split("\n")])
rows = [[x or "" for x in row] for row in rows]
if sort_by:
rows.sort(key=lambda x: tuple(x[i] for i in sort_by))
if not headers:
col_widths = [max(itemlen(item) for item in col) for col in zip(*rows)]
else:

View file

@ -8,6 +8,7 @@ from datetime import datetime
import pytest
import yaml
from llama_stack.distribution.configure import (
LLAMA_STACK_RUN_CONFIG_VERSION,
parse_and_maybe_upgrade_config,
@ -41,9 +42,7 @@ def up_to_date_config():
- provider_id: provider1
provider_type: inline::meta-reference
config: {{}}
""".format(
version=LLAMA_STACK_RUN_CONFIG_VERSION, built_at=datetime.now().isoformat()
)
""".format(version=LLAMA_STACK_RUN_CONFIG_VERSION, built_at=datetime.now().isoformat())
)
@ -83,9 +82,7 @@ def old_config():
telemetry:
provider_type: noop
config: {{}}
""".format(
built_at=datetime.now().isoformat()
)
""".format(built_at=datetime.now().isoformat())
)
@ -108,10 +105,7 @@ def test_parse_and_maybe_upgrade_config_up_to_date(up_to_date_config):
def test_parse_and_maybe_upgrade_config_old_format(old_config):
result = parse_and_maybe_upgrade_config(old_config)
assert result.version == LLAMA_STACK_RUN_CONFIG_VERSION
assert all(
api in result.providers
for api in ["inference", "safety", "memory", "telemetry"]
)
assert all(api in result.providers for api in ["inference", "safety", "memory", "telemetry"])
safety_provider = result.providers["safety"][0]
assert safety_provider.provider_type == "meta-reference"
assert "llama_guard_shield" in safety_provider.config

Some files were not shown because too many files have changed in this diff