Merge pull request #1 from meta-llama/main

Merging upstream changes
cdgamarose-nv 2025-02-13 11:16:22 -08:00 committed by GitHub
commit eb1c5e86fe
389 changed files with 10041 additions and 7739 deletions

.github/CODEOWNERS

@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721
* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan


@ -1,6 +1,6 @@
name: 🐛 Bug Report
description: Create a report to help us reproduce and fix the bug
labels: ["bug"]
body:
- type: markdown
attributes:

.github/ISSUE_TEMPLATE/config.yml

@ -0,0 +1,12 @@
blank_issues_enabled: false
contact_links:
- name: Have you read the docs?
url: https://llama-stack.readthedocs.io/en/latest/index.html
about: Much help can be found in the docs
- name: Start a discussion
url: https://github.com/meta-llama/llama-stack/discussions/new
about: Start a discussion on a topic
- name: Chat on Discord
url: https://discord.gg/llama-stack
about: Maybe chatting with the community can help


@ -1,6 +1,6 @@
name: 🚀 Feature request
description: Request a new llama-stack feature
labels: ["enhancement"]
body:
- type: textarea
id: feature-pitch


@ -1,27 +1,10 @@
# What does this PR do?
[Provide a short summary of what this PR does and why. Link to relevant issues if applicable.]
In short, provide a summary of what this PR does and why. Usually, the relevant context should be present in a linked issue.
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])
- [ ] Addresses issue (#issue)
## Test Plan
[Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*]
Please describe:
[//]: # (## Documentation)
- tests you ran to verify your changes with result summaries.
- provide instructions so it can be reproduced.
## Sources
Please link relevant resources if necessary.
## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Ran pre-commit to handle lint / formatting issues.
- [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md),
Pull Request section?
- [ ] Updated relevant documentation.
- [ ] Wrote necessary unit or integration tests.


@ -11,10 +11,10 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: pip
@ -22,4 +22,8 @@ jobs:
**/requirements*.txt
.pre-commit-config.yaml
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd #v3.0.1
- uses: pre-commit/action@v3.0.1
- name: Verify if there are any diff files after pre-commit
run: |
git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
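The added step fails the job whenever the hooks leave modified files behind. As a rough local equivalent (assuming pre-commit is already installed in the active environment, for example via the uv-based flow described in CONTRIBUTING.md further down), one might run:

```bash
# Run every hook against the whole tree, then fail if any tracked file was modified.
pre-commit run --all-files
git diff --exit-code || {
  echo "There are uncommitted changes, run pre-commit locally and commit again"
  exit 1
}
```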


@ -1,148 +0,0 @@
name: Docker Build and Publish
on:
workflow_dispatch:
inputs:
version:
description: 'TestPyPI or PyPI version to build (e.g., 0.0.63.dev20250114)'
required: true
type: string
jobs:
build-and-push:
runs-on: ubuntu-latest
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to the Container registry
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Set version
id: version
run: |
if [ "${{ github.event_name }}" = "push" ]; then
echo "VERSION=0.0.63.dev51206766" >> $GITHUB_OUTPUT
else
echo "VERSION=${{ inputs.version }}" >> $GITHUB_OUTPUT
fi
- name: Check package version availability
run: |
# Function to check if version exists in a repository
check_version() {
local repo=$1
local VERSION_TO_CHECK=${{ steps.version.outputs.version }}
echo "Checking version $VERSION_TO_CHECK in $repo"
result=$(curl -s "https://$repo.org/pypi/llama-stack/json" | jq --arg v "$VERSION_TO_CHECK" '.releases | has($v)')
echo "Result: $result"
return $([ "$result" = "true" ])
}
# Check TestPyPI first, then PyPI
if check_version "test.pypi"; then
echo "Version ${{ steps.version.outputs.version }} found in TestPyPI"
echo "PYPI_SOURCE=testpypi" >> $GITHUB_ENV
elif check_version "pypi"; then
echo "Version ${{ steps.version.outputs.version }} found in PyPI"
echo "PYPI_SOURCE=pypi" >> $GITHUB_ENV
else
echo "Error: Version ${{ steps.version.outputs.version }} not found in either TestPyPI or PyPI"
exit 1
fi
- name: Install llama-stack
run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
if [ "${{ github.event_name }}" = "push" ]; then
pip install -e .
else
if [ "$PYPI_SOURCE" = "testpypi" ]; then
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple llama-stack==${{ steps.version.outputs.version }}
else
pip install llama-stack==${{ steps.version.outputs.version }}
fi
fi
- name: Build docker image
run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
echo "VERSION=${{ steps.version.outputs.version }}"
TEMPLATES=("ollama" "bedrock" "remote-vllm" "fireworks" "together" "tgi" "meta-reference-gpu")
for template in "${TEMPLATES[@]}"; do
if [ "$PYPI_SOURCE" = "testpypi" ]; then
TEST_PYPI_VERSION=${{ steps.version.outputs.version }} llama stack build --template $template --image-type container
else
PYPI_VERSION=${{ steps.version.outputs.version }} llama stack build --template $template --image-type container
fi
done
- name: List docker images
run: |
docker images
# TODO (xiyan): make the following 2 steps into a matrix and test all templates other than fireworks
- name: Start up built docker image
run: |
cd distributions/fireworks
if [ "$PYPI_SOURCE" = "testpypi" ]; then
sed -i 's|image: llamastack/distribution-fireworks|image: distribution-fireworks:test-${{ steps.version.outputs.version }}|' ./compose.yaml
else
sed -i 's|image: llamastack/distribution-fireworks|image: distribution-fireworks:${{ steps.version.outputs.version }}|' ./compose.yaml
fi
docker compose up -d
cd ..
# Wait for the container to start
timeout=300
while ! curl -s -f http://localhost:8321/v1/version > /dev/null && [ $timeout -gt 0 ]; do
echo "Waiting for endpoint to be available..."
sleep 5
timeout=$((timeout - 5))
done
if [ $timeout -le 0 ]; then
echo "Timeout waiting for endpoint to become available"
exit 1
fi
- name: Run simple models list test on docker server
run: |
curl http://localhost:8321/v1/models
# TODO (xiyan): figure out why client cannot find server but curl works
# - name: Run pytest on docker server
# run: |
# pip install pytest pytest-md-report
# export LLAMA_STACK_BASE_URL="http://localhost:8321"
# LLAMA_STACK_BASE_URL="http://localhost:8321" pytest -v tests/client-sdk/inference/test_inference.py --md-report --md-report-verbose=1
- name: Push to dockerhub
run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
echo "VERSION=${{ steps.version.outputs.version }}"
TEMPLATES=("ollama" "bedrock" "remote-vllm" "fireworks" "together" "tgi" "meta-reference-gpu")
for template in "${TEMPLATES[@]}"; do
if [ "$PYPI_SOURCE" = "testpypi" ]; then
docker tag distribution-$template:test-${{ steps.version.outputs.version }} llamastack/distribution-$template:test-${{ steps.version.outputs.version }}
docker push llamastack/distribution-$template:test-${{ steps.version.outputs.version }}
else
docker tag distribution-$template:${{ steps.version.outputs.version }} llamastack/distribution-$template:${{ steps.version.outputs.version }}
docker tag distribution-$template:${{ steps.version.outputs.version }} llamastack/distribution-$template:latest
docker push llamastack/distribution-$template:${{ steps.version.outputs.version }}
docker push llamastack/distribution-$template:latest
fi
done


@ -1,244 +0,0 @@
name: Publish Python 🐍 distribution 📦 to TestPyPI
on:
workflow_dispatch: # Keep manual trigger
inputs:
version:
description: 'Version number (e.g. 0.0.63.dev20250111)'
required: true
type: string
schedule:
- cron: "0 0 * * *" # Run every day at midnight
jobs:
trigger-client-and-models-build:
name: Trigger llama-stack-client and llama-models build
runs-on: ubuntu-latest
outputs:
version: ${{ steps.version.outputs.version }}
client_run_id: ${{ steps.trigger-client.outputs.workflow_id }}
model_run_id: ${{ steps.trigger-models.outputs.workflow_id }}
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Get date
id: date
run: echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
- name: Compute version based on dispatch event
id: version
run: |
# Read base version from pyproject.toml
version=$(sed -n 's/.*version="\([^"]*\)".*/\1/p' setup.py)
if [ "${{ github.event_name }}" = "schedule" ]; then
echo "version=${version}.dev${{ steps.date.outputs.date }}" >> $GITHUB_OUTPUT
elif [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "version=${{ inputs.version }}" >> $GITHUB_OUTPUT
else
echo "version=${version}.dev$(shuf -i 10000000-99999999 -n 1)" >> $GITHUB_OUTPUT
fi
- name: Trigger llama-stack-client workflow
id: trigger-client
run: |
response=$(curl -X POST https://api.github.com/repos/meta-llama/llama-stack-client-python/dispatches \
-H 'Accept: application/vnd.github.everest-preview+json' \
-H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
--data "{\"event_type\": \"build-client-package\", \"client_payload\": {\"source\": \"llama-stack-nightly\", \"version\": \"${{ steps.version.outputs.version }}\"}}" \
-w "\n%{http_code}")
http_code=$(echo "$response" | tail -n1)
if [ "$http_code" != "204" ]; then
echo "Failed to trigger client workflow"
exit 1
fi
# Get the run ID of the triggered workflow
sleep 5 # Wait for workflow to be created
run_id=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-stack-client-python/actions/runs?event=repository_dispatch" \
| jq '.workflow_runs[0].id')
echo "workflow_id=$run_id" >> $GITHUB_OUTPUT
- name: Trigger llama-models workflow
id: trigger-models
run: |
response=$(curl -X POST https://api.github.com/repos/meta-llama/llama-models/dispatches \
-H 'Accept: application/vnd.github.everest-preview+json' \
-H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
--data "{\"event_type\": \"build-models-package\", \"client_payload\": {\"source\": \"llama-stack-nightly\", \"version\": \"${{ steps.version.outputs.version }}\"}}" \
-w "\n%{http_code}")
http_code=$(echo "$response" | tail -n1)
if [ "$http_code" != "204" ]; then
echo "Failed to trigger models workflow"
exit 1
fi
# Get the run ID of the triggered workflow
sleep 5 # Wait for workflow to be created
run_id=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-models/actions/runs?event=repository_dispatch" \
| jq '.workflow_runs[0].id')
echo "workflow_id=$run_id" >> $GITHUB_OUTPUT
wait-for-workflows:
name: Wait for triggered workflows
needs: trigger-client-and-models-build
runs-on: ubuntu-latest
steps:
- name: Wait for client workflow
run: |
while true; do
status=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-stack-client-python/actions/runs/${{ needs.trigger-client-and-models-build.outputs.client_run_id }}" \
| jq -r '.status')
conclusion=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-stack-client-python/actions/runs/${{ needs.trigger-client-and-models-build.outputs.client_run_id }}" \
| jq -r '.conclusion')
echo "llama-stack-client-python workflow status: $status, conclusion: $conclusion"
if [ "$status" = "completed" ]; then
if [ "$conclusion" != "success" ]; then
echo "llama-stack-client-python workflow failed"
exit 1
fi
break
fi
sleep 10
done
- name: Wait for models workflow
run: |
while true; do
status=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-models/actions/runs/${{ needs.trigger-client-and-models-build.outputs.model_run_id }}" \
| jq -r '.status')
conclusion=$(curl -s -H "authorization: Bearer ${{ secrets.PAT_TOKEN }}" \
"https://api.github.com/repos/meta-llama/llama-models/actions/runs/${{ needs.trigger-client-and-models-build.outputs.model_run_id }}" \
| jq -r '.conclusion')
echo "llama-models workflow status: $status, conclusion: $conclusion"
if [ "$status" = "completed" ]; then
if [ "$conclusion" != "success" ]; then
echo "llama-models workflow failed"
exit 1
fi
break
fi
sleep 10
done
build:
name: Build distribution 📦
needs:
- wait-for-workflows
- trigger-client-and-models-build
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Get date
id: date
run: echo "date=$(date +'%Y%m%d')" >> $GITHUB_OUTPUT
- name: Update version for nightly
run: |
sed -i 's/version="\([^"]*\)"/version="${{ needs.trigger-client-and-models-build.outputs.version }}"/' setup.py
sed -i 's/llama-stack-client>=\([^"]*\)/llama-stack-client==${{ needs.trigger-client-and-models-build.outputs.version }}/' requirements.txt
sed -i 's/llama-models>=\([^"]*\)/llama-models==${{ needs.trigger-client-and-models-build.outputs.version }}/' requirements.txt
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install pypa/build
run: >-
python3 -m
pip install
build
--user
- name: Build a binary wheel and a source tarball
run: python3 -m build
- name: Store the distribution packages
uses: actions/upload-artifact@v4
with:
name: python-package-distributions
path: dist/
publish-to-testpypi:
name: Publish Python 🐍 distribution 📦 to TestPyPI
needs:
- build
runs-on: ubuntu-latest
environment:
name: testrelease
url: https://test.pypi.org/p/llama-stack
permissions:
id-token: write # IMPORTANT: mandatory for trusted publishing
steps:
- name: Download all the dists
uses: actions/download-artifact@v4
with:
name: python-package-distributions
path: dist/
- name: Publish distribution 📦 to TestPyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
repository-url: https://test.pypi.org/legacy/
test-published-package:
name: Test published package
needs:
- publish-to-testpypi
- trigger-client-and-models-build
runs-on: ubuntu-latest
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
steps:
- uses: actions/checkout@v4
with:
persist-credentials: false
- name: Install the package
run: |
max_attempts=6
attempt=1
while [ $attempt -le $max_attempts ]; do
echo "Attempt $attempt of $max_attempts to install package..."
if pip install --no-cache --index-url https://pypi.org/simple/ --extra-index-url https://test.pypi.org/simple/ llama-stack==${{ needs.trigger-client-and-models-build.outputs.version }}; then
echo "Package installed successfully"
break
fi
if [ $attempt -ge $max_attempts ]; then
echo "Failed to install package after $max_attempts attempts"
exit 1
fi
attempt=$((attempt + 1))
sleep 10
done
- name: Test the package versions
run: |
pip list | grep llama_
- name: Test CLI commands
run: |
llama model list
llama stack build --list-templates
llama model prompt-format -m Llama3.2-11B-Vision-Instruct
llama stack list-apis
llama stack list-providers inference
llama stack list-providers telemetry
- name: Test Notebook
run: |
pip install pytest nbval
llama stack build --template together --image-type venv
pytest -v -s --nbval-lax ./docs/getting_started.ipynb
pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
# TODO: add trigger for integration test workflow & docker builds

.github/workflows/semantic-pr.yml

@ -0,0 +1,21 @@
name: Check semantic PR titles
on:
pull_request_target:
types:
- opened
- edited
- reopened
- synchronize
permissions:
contents: read
jobs:
title-check:
runs-on: ubuntu-latest
steps:
- name: Check PR Title's semantic conformance
uses: amannn/action-semantic-pull-request@v5
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/tests.yml

@ -0,0 +1,69 @@
name: auto-tests
on:
# pull_request:
workflow_dispatch:
inputs:
commit_sha:
description: 'Specific Commit SHA to trigger on'
required: false
default: $GITHUB_SHA # default to the last commit of $GITHUB_REF branch
jobs:
test-llama-stack-as-library:
runs-on: ubuntu-latest
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
strategy:
matrix:
provider: [fireworks, together]
steps:
- uses: actions/checkout@v4
with:
ref: ${{ github.event.inputs.commit_sha }}
- name: Echo commit SHA
run: |
echo "Triggered on commit SHA: ${{ github.event.inputs.commit_sha }}"
git rev-parse HEAD
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt pytest
pip install -e .
- name: Build providers
run: |
llama stack build --template ${{ matrix.provider }} --image-type venv
- name: Install the latest llama-stack-client & llama-models packages
run: |
pip install -e git+https://github.com/meta-llama/llama-stack-client-python.git#egg=llama-stack-client
pip install -e git+https://github.com/meta-llama/llama-models.git#egg=llama-models
- name: Run client-sdk test
working-directory: "${{ github.workspace }}"
env:
REPORT_OUTPUT: md_report.md
shell: bash
run: |
pip install --upgrade pytest-md-report
echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV"
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT"
- name: Output reports to the job summary
if: always()
shell: bash
run: |
if [ -f "$REPORT_FILE" ]; then
echo "<details><summary> Test Report for ${{ matrix.provider }} </summary>" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
cat "$REPORT_FILE" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "</details>" >> $GITHUB_STEP_SUMMARY
fi
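For reference, a rough local equivalent of this job's test step, assuming the repository is installed (`pip install -e .`), the chosen template has been built with `llama stack build --template together --image-type venv`, and the matching provider API key (e.g. TOGETHER_API_KEY) is exported, could look like:

```bash
# Mirrors the client-sdk test step above for the "together" provider (illustrative only).
pip install pytest pytest-md-report
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
LLAMA_STACK_CONFIG=./llama_stack/templates/together/run.yaml \
  pytest -v --md-report --md-report-verbose=1 \
  --md-report-output md_report.md ./tests/client-sdk/inference/
```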


@ -0,0 +1,40 @@
name: Update ReadTheDocs
on:
workflow_dispatch:
inputs:
branch:
description: 'RTD version to update'
required: false
default: 'latest'
push:
branches:
- main
paths:
- 'docs/source/**'
- 'docs/resources/**'
- '.github/workflows/update-readthedocs.yml'
jobs:
update-readthedocs:
runs-on: ubuntu-latest
env:
TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
steps:
- name: Trigger ReadTheDocs build
run: |
if [ -z "$TOKEN" ]; then
echo "READTHEDOCS_TOKEN is not set"
exit 1
fi
response=$(curl -X POST \
-H "Content-Type: application/json" \
-d "{\"token\": \"$TOKEN\"}" \
https://readthedocs.org/api/v2/webhook/llama-stack/289768/)
echo "Response: $response"
if [ $(echo $response | jq -r '.build_triggered') != 'true' ]; then
echo "Failed to trigger ReadTheDocs build"
exit 1
fi

.gitignore

@ -19,3 +19,4 @@ Package.resolved
_build
docs/src
pyrightconfig.json
venv/


@ -5,10 +5,8 @@ default_language_version:
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: 6306a48f7dae5861702d573c9c247e4e9498e867
rev: v5.0.0 # Latest stable version
hooks:
- id: trailing-whitespace
- id: check-ast
- id: check-merge-conflict
- id: check-added-large-files
args: ['--maxkb=1000']
@ -28,23 +26,41 @@ repos:
- --license-filepath
- docs/license_header.txt
- repo: https://github.com/pycqa/flake8
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: 34cbf8ef3950f43d09b85e2e45c15ae5717dc37b
rev: v0.9.4
hooks:
- id: flake8
additional_dependencies:
- flake8-bugbear == 22.4.25
- pep8-naming == 0.12.1
- torchfix
args: ['--config=.flake8']
# Run the linter with import sorting.
- id: ruff
args: [
--fix,
--exit-non-zero-on-fix,
--select, I,
]
- id: ruff-format
- repo: https://github.com/omnilib/ufmt
- repo: https://github.com/adamchainz/blacken-docs
rev: v2.7.0
rev: 1.19.0
hooks:
- id: ufmt
- id: blacken-docs
additional_dependencies:
- black == 24.4.2
- black==24.3.0
- usort == 1.0.8
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.5.26
hooks:
- id: uv-export
args: ["--frozen", "--no-hashes", "--no-emit-project"]
- id: uv-sync
# - repo: https://github.com/pre-commit/mirrors-mypy
# rev: v1.14.0
# hooks:
# - id: mypy
# additional_dependencies:
# - types-requests
# - types-setuptools
# - pydantic
# args: [--ignore-missing-imports]
# - repo: https://github.com/jsh9/pydoclint
# rev: d88180a8632bb1602a4d81344085cf320f288c5a
@ -71,3 +87,7 @@ repos:
# require_serial: true
# files: ^llama_stack/templates/.*$
# stages: [manual]
ci:
autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks
autoupdate_commit_msg: ⬆ [pre-commit.ci] pre-commit autoupdate
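For contributors who want to run the new linters outside of pre-commit, a rough command-line equivalent of the ruff hook arguments above (the target path `.` is an assumption) is:

```bash
# Lint with autofix and import sorting (mirrors the ruff hook args), then format.
ruff check --fix --exit-non-zero-on-fix --select I .
ruff format .
```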


@ -1,7 +1,8 @@
[flake8]
# Suggested config from pytorch that we can adapt
select = B,C,E,F,N,P,T4,W,B9,TOR0,TOR1,TOR2
lint.select = ["B", "C", "E" , "F" , "N", "W", "B9"]
max-line-length = 120
line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
# N812 ignored because import torch.nn.functional as F is PyTorch convention
@ -9,23 +10,28 @@ max-line-length = 120
# E731 allow usage of assigning lambda expressions
# E701 let black auto-format statements on one line
# E704 let black auto-format statements on one line
ignore =
lint.ignore = [
E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,N812,N817,E731,E701,E704
"E203", "E305", "E402", "E501", "E721", "E741", "F405", "F821", "F841",
"C408", "E302", "W291", "E303", "N812", "N817", "E731", "E701",
# These are the additional ones we started ignoring after moving to ruff. We should look into each one of them later.
"C901", "C405", "C414", "N803", "N999", "C403", "C416", "B028", "C419", "C401", "B023",
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
"EXE001",
# random naming hints don't need
N802,
"N802",
# these ignores are from flake8-bugbear; please fix!
B007,B008,B950
"B007", "B008"
optional-ascii-coding = True
]
exclude =
./.git,
./docs/*,
./build,
./scripts,
./venv,
*.pyi,
.pre-commit-config.yaml,
*.md,
.flake8
exclude = [
"./.git",
"./docs/*",
"./build",
"./scripts",
"./venv",
"*.pyi",
".pre-commit-config.yaml",
"*.md",
".flake8"
]


@ -1,35 +0,0 @@
# Changelog
## 0.0.53
### Added
- Resource-oriented design for models, shields, memory banks, datasets and eval tasks
- Persistence for registered objects with distribution
- Ability to persist memory banks created for FAISS
- PostgreSQL KVStore implementation
- Environment variable placeholder support in run.yaml files
- Comprehensive Zero-to-Hero notebooks and quickstart guides
- Support for quantized models in Ollama
- Vision models support for Together, Fireworks, Meta-Reference, and Ollama, and vLLM
- Bedrock distribution with safety shields support
- Evals API with task registration and scoring functions
- MMLU and SimpleQA benchmark scoring functions
- Huggingface dataset provider integration for benchmarks
- Support for custom dataset registration from local paths
- Benchmark evaluation CLI tools with visualization tables
- RAG evaluation scoring functions and metrics
- Local persistence for datasets and eval tasks
### Changed
- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
- Updated API signatures for dataset and eval task registration
- Restructured folder organization for providers
- Enhanced Docker build configuration
- Added version prefixing for REST API routes
- Enhanced evaluation task registration workflow
- Improved benchmark evaluation output formatting
- Restructured evals folder organization for better modularity
### Removed
- `llama stack configure` command


@ -40,6 +40,7 @@ If you need help or guidance, comment on the issue. Issues that are extra friend
3. Ensure the test suite passes.
4. Make sure your code lints using `pre-commit`.
5. If you haven't already, complete the Contributor License Agreement ("CLA").
6. Ensure your pull request follows the [conventional commits format](https://www.conventionalcommits.org/en/v1.0.0/).
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
@ -56,22 +57,50 @@ disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Set up your development environment
We use [uv](https://github.com/astral-sh/uv) to manage python dependencies and virtual environments.
You can install `uv` by following this [guide](https://docs.astral.sh/uv/getting-started/installation/).
You can install the dependencies by running:
```bash
$ cd llama-stack
$ uv sync --extra dev
$ uv pip install -e .
$ source .venv/bin/activate
```
## Pre-commit Hooks
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
```bash
$ cd llama-stack
$ conda activate <your-environment>
$ pip install pre-commit
$ pre-commit install
$ uv run pre-commit install
```
After that, pre-commit hooks will run automatically before each commit.
Alternatively, if you don't want to install the pre-commit hooks, you can run the checks manually by running:
```bash
$ uv run pre-commit run --all-files
```
> [!CAUTION]
> Before pushing your changes, make sure that the pre-commit hooks have passed successfully.
## Adding a new dependency to the project
To add a new dependency to the project, you can use the `uv` command. For example, to add `foo` to the project, you can run:
```bash
$ uv add foo
$ uv sync
```
## Coding Style
* 2 spaces for indentation rather than tabs
* 4 spaces for indentation rather than tabs
* 80 character line length
* ...
@ -102,13 +131,12 @@ If you have made changes to a provider's configuration in any form (introducing
If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.
```bash
$ cd llama-stack/docs
pip install -r requirements.txt
$ uv sync --extra docs
pip install sphinx-autobuild
# This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
$ make html
sphinx-autobuild source build/html
$ uv run sphinx-autobuild source build/html
```


@ -1,4 +1,4 @@
include requirements.txt
include pyproject.toml
include distributions/dependencies.json
include llama_stack/distribution/*.sh
include llama_stack/cli/scripts/*.sh


@ -2,17 +2,18 @@
[![PyPI version](https://img.shields.io/pypi/v/llama_stack.svg)](https://pypi.org/project/llama_stack/)
[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/)
[![License](https://img.shields.io/pypi/l/llama_stack.svg)](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)
[![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack)
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb)
Llama Stack defines and standardizes the core building blocks that simplify AI application development. It codified best practices across the Llama ecosystem. More specifically, it provides
Llama Stack standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides
- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
- **Plugin architecture** to support the rich ecosystem of implementations of the different APIs in different environments like local development, on-premises, cloud, and mobile.
- **Plugin architecture** to support the rich ecosystem of different API implementations in various environments, including local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment.
- **Multiple developer interfaces** like CLI and SDKs for Python, Node, iOS, and Android
- **Multiple developer interfaces** like CLI and SDKs for Python, Typescript, iOS, and Android.
- **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack.
<div style="text-align: center;">
<img
@ -24,31 +25,31 @@ Llama Stack defines and standardizes the core building blocks that simplify AI a
</div>
### Llama Stack Benefits
- **Flexible Options**: Developers can choose their preferred infrastructure without changing APIs and enjoy flexible deployment choices.
- **Consistent Experience**: With its unified APIs, Llama Stack makes it easier to build, test, and deploy AI applications with consistent application behavior.
- **Robust Ecosystem**: Llama Stack is already integrated with distribution partners (cloud providers, hardware vendors, and AI-focused companies) that offer tailored infrastructure, software, and services for deploying Llama models.
By reducing friction and complexity, Llama Stack empowers developers to focus on what they do best: building transformative generative AI applications.
### API Providers
Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.
| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ |
| SambaNova | Hosted | | ✅ | | | |
| Cerebras | Hosted | | ✅ | | | |
| Fireworks | Hosted | ✅ | ✅ | ✅ | | |
| AWS Bedrock | Hosted | | ✅ | | ✅ | |
| Together | Hosted | ✅ | ✅ | | ✅ | |
| Groq | Hosted | | ✅ | | | |
| Ollama | Single Node | | ✅ | | | |
| TGI | Hosted and Single Node | | ✅ | | | |
| NVIDIA NIM | Hosted and Single Node | | ✅ | | | |
| Chroma | Single Node | | | ✅ | | |
| PG Vector | Single Node | | | ✅ | | |
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | |
| vLLM | Hosted and Single Node | | ✅ | | | |
### Distributions
@ -70,15 +71,15 @@ A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider
You have two ways to install this repository:
* **Install as a package**:
You can install the repository directly from [PyPI](https://pypi.org/project/llama-stack/) by running the following command:
```bash
pip install llama-stack
```
* **Install from source**:
If you prefer to install from the source code, make sure you have [conda installed](https://docs.conda.io/projects/conda/en/stable).
Then, run the following commands:
```bash
mkdir -p ~/local
cd ~/local
@ -95,10 +96,11 @@ You have two ways to install this repository:
Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details.
* [CLI reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html)
* Guide using `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
* [Getting Started](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html)
* Quick guide to start a Llama Stack server.
* CLI references
* [llama (server-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_cli_reference/index.html): Guide for using the `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
* [llama (client-side) CLI Reference](https://llama-stack.readthedocs.io/en/latest/references/llama_stack_client_cli_reference.html): Guide for using the `llama-stack-client` CLI, which allows you to query information about the distribution.
* Getting Started
* [Quick guide to start a Llama Stack server](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).
* [Jupyter notebook](./docs/getting_started.ipynb) to walk-through how to use simple text and vision inference llama_stack_client APIs
* The complete Llama Stack lesson [Colab notebook](https://colab.research.google.com/drive/1dtVmxotBsI4cGZQNsJRYPrLiDeT0Wnwt) of the new [Llama 3.2 course on Deeplearning.ai](https://learn.deeplearning.ai/courses/introducing-multimodal-llama-3-2/lesson/8/llama-stack).
* A [Zero-to-Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guide you through all the key components of llama stack with code samples.
@ -111,9 +113,9 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest
| :----: | :----: | :----: |
| Python | [llama-stack-client-python](https://github.com/meta-llama/llama-stack-client-python) | [![PyPI version](https://img.shields.io/pypi/v/llama_stack_client.svg)](https://pypi.org/project/llama_stack_client/)
| Swift | [llama-stack-client-swift](https://github.com/meta-llama/llama-stack-client-swift) | [![Swift Package Index](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2Fmeta-llama%2Fllama-stack-client-swift%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/meta-llama/llama-stack-client-swift)
| Node | [llama-stack-client-node](https://github.com/meta-llama/llama-stack-client-node) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
| Typescript | [llama-stack-client-typescript](https://github.com/meta-llama/llama-stack-client-typescript) | [![NPM version](https://img.shields.io/npm/v/llama-stack-client.svg)](https://npmjs.org/package/llama-stack-client)
| Kotlin | [llama-stack-client-kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) | [![Maven version](https://img.shields.io/maven-central/v/com.llama.llamastack/llama-stack-client-kotlin)](https://central.sonatype.com/artifact/com.llama.llamastack/llama-stack-client-kotlin)
Check out our client SDKs for connecting to Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [node](https://github.com/meta-llama/llama-stack-client-node), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.
Check out our client SDKs for connecting to a Llama Stack server in your preferred language, you can choose from [python](https://github.com/meta-llama/llama-stack-client-python), [typescript](https://github.com/meta-llama/llama-stack-client-typescript), [swift](https://github.com/meta-llama/llama-stack-client-swift), and [kotlin](https://github.com/meta-llama/llama-stack-client-kotlin) programming languages to quickly build your applications.
You can find more example scripts with client SDKs to talk with the Llama Stack server in our [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main/examples) repo.


@ -1,9 +1,46 @@
{
"sambanova": [
"bedrock": [
"aiosqlite",
"autoevals",
"blobfile",
"boto3",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"cerebras": [
"aiosqlite",
"autoevals",
"blobfile",
"cerebras_cloud_sdk",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu", "faiss-cpu",
"fastapi", "fastapi",
"fire", "fire",
@ -27,7 +64,110 @@
"transformers", "transformers",
"uvicorn", "uvicorn",
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu" "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"dell": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"fireworks": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"hf-endpoint": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
], ],
"hf-serverless": [ "hf-serverless": [
"aiohttp", "aiohttp",
@ -62,211 +202,7 @@
"transformers", "transformers",
"uvicorn", "uvicorn",
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu" "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"together": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"together",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"vllm-gpu": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"vllm",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"remote-vllm": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"fireworks": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"tgi": [
"aiohttp",
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"bedrock": [
"aiosqlite",
"autoevals",
"blobfile",
"boto3",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
], ],
"meta-reference-gpu": [ "meta-reference-gpu": [
"accelerate", "accelerate",
@ -306,39 +242,7 @@
"uvicorn", "uvicorn",
"zmq", "zmq",
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu" "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"nvidia": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
], ],
"meta-reference-quantized-gpu": [ "meta-reference-quantized-gpu": [
"accelerate", "accelerate",
@ -380,21 +284,20 @@
"uvicorn", "uvicorn",
"zmq", "zmq",
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu" "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
], ],
"cerebras": [ "nvidia": [
"aiosqlite", "aiosqlite",
"autoevals", "autoevals",
"blobfile", "blobfile",
"cerebras_cloud_sdk",
"chardet", "chardet",
"chromadb-client",
"datasets", "datasets",
"faiss-cpu", "faiss-cpu",
"fastapi", "fastapi",
"fire", "fire",
"httpx", "httpx",
"matplotlib", "matplotlib",
"mcp",
"nltk", "nltk",
"numpy", "numpy",
"openai", "openai",
@ -413,7 +316,7 @@
"transformers", "transformers",
"uvicorn", "uvicorn",
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu" "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
], ],
"ollama": [ "ollama": [
"aiohttp", "aiohttp",
@ -447,9 +350,72 @@
"transformers", "transformers",
"uvicorn", "uvicorn",
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu" "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
], ],
"hf-endpoint": [ "remote-vllm": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"sambanova": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"tgi": [
"aiohttp", "aiohttp",
"aiosqlite", "aiosqlite",
"autoevals", "autoevals",
@ -482,6 +448,74 @@
"transformers", "transformers",
"uvicorn", "uvicorn",
"sentence-transformers --no-deps", "sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu" "torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"together": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"together",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
],
"vllm-gpu": [
"aiosqlite",
"autoevals",
"blobfile",
"chardet",
"chromadb-client",
"datasets",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"mcp",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"vllm",
"sentence-transformers --no-deps",
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
]
}
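distributions/dependencies.json maps each distribution template to the pip requirement strings its build installs. Purely as an illustration (the use of `jq` and the choice of the `together` template are assumptions of this note, not part of the change), a single template's list can be inspected with:

```bash
# Print the pip requirements recorded for the "together" template.
jq -r '.together[]' distributions/dependencies.json
```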


@ -1,65 +0,0 @@
# Together Distribution
### Connect to a Llama Stack Together Endpoint
- You may connect to a hosted endpoint `https://llama-stack.together.ai`, serving a Llama Stack distribution
The `llamastack/distribution-together` distribution consists of the following provider configurations.
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | remote::together | meta-reference | meta-reference, remote::weaviate | meta-reference | meta-reference |
### Docker: Start the Distribution (Single Node CPU)
> [!NOTE]
> This assumes you have an hosted endpoint at Together with API Key.
```
$ cd distributions/together
$ ls
compose.yaml run.yaml
$ docker compose up
```
Make sure in you `run.yaml` file, you inference provider is pointing to the correct Together URL server endpoint. E.g.
```
inference:
- provider_id: together
provider_type: remote::together
config:
url: https://api.together.xyz/v1
api_key: <optional api key>
```
### Conda llama stack run (Single Node CPU)
```bash
llama stack build --template together --image-type conda
# -- modify run.yaml to a valid Together server endpoint
llama stack run ./run.yaml
```
### (Optional) Update Model Serving Configuration
Use `llama-stack-client models list` to check the available models served by together.
```
$ llama-stack-client models list
+------------------------------+------------------------------+---------------+------------+
| identifier | llama_model | provider_id | metadata |
+==============================+==============================+===============+============+
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
```
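The same listing is available programmatically. A minimal sketch, assuming the client can reach your running distribution at the address below:
```python
from llama_stack_client import LlamaStackClient

# Assumed address of your running distribution.
client = LlamaStackClient(base_url="http://localhost:5000")

# Print the identifier of every model the distribution currently serves.
for model in client.models.list():
    print(model.identifier)
```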

View file

@ -12,3 +12,7 @@
.wy-side-nav-search { .wy-side-nav-search {
background-color: transparent !important; background-color: transparent !important;
} }
.hide-title h1 {
  display: none;
}

File diff suppressed because it is too large

File diff suppressed because it is too large

9
docs/conftest.py Normal file
View file

@ -0,0 +1,9 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
def pytest_collection_modifyitems(items):
    for item in items:
        item.name = item.name.replace(' ', '_')
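For illustration, a small self-contained sketch of the renaming this hook performs; the item names here are hypothetical stand-ins for nbval-style notebook cells, whose names often contain spaces:
```python
# Minimal stand-in for pytest's collected items, used only to show the effect.
class FakeItem:
    def __init__(self, name):
        self.name = name

items = [FakeItem("getting_started.ipynb::Cell 0"), FakeItem("notebook Cell 3")]

for item in items:
    item.name = item.name.replace(' ', '_')

print([item.name for item in items])
# ['getting_started.ipynb::Cell_0', 'notebook_Cell_3']
```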

View file

@ -7,7 +7,7 @@
"id": "c1e7571c" "id": "c1e7571c"
}, },
"source": [ "source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1F2ksmkoGQPa4pzRjMOE6BXWeOxWFIW6n?usp=sharing)\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)\n",
"\n", "\n",
"# Llama Stack - Building AI Applications\n", "# Llama Stack - Building AI Applications\n",
"\n", "\n",
@ -15,7 +15,7 @@
"\n", "\n",
"[Llama Stack](https://github.com/meta-llama/llama-stack) defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations.\n", "[Llama Stack](https://github.com/meta-llama/llama-stack) defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations.\n",
"\n", "\n",
"Read more about the project: https://llama-stack.readthedocs.io/en/latest/index.html\n", "Read more about the project here: https://llama-stack.readthedocs.io/en/latest/index.html\n",
"\n", "\n",
"In this guide, we will showcase how you can build LLM-powered agentic applications using Llama Stack.\n" "In this guide, we will showcase how you can build LLM-powered agentic applications using Llama Stack.\n"
] ]
@ -71,7 +71,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": null,
"id": "J2kGed0R5PSf", "id": "J2kGed0R5PSf",
"metadata": { "metadata": {
"colab": { "colab": {
@ -81,119 +81,15 @@
"id": "J2kGed0R5PSf", "id": "J2kGed0R5PSf",
"outputId": "2478ea60-8d35-48a1-b011-f233831740c5" "outputId": "2478ea60-8d35-48a1-b011-f233831740c5"
}, },
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" bubblewrap\n",
"0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.\n",
"Need to get 46.3 kB of archives.\n",
"After this operation, 132 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 bubblewrap amd64 0.6.1-1ubuntu0.1 [46.3 kB]\n",
"Fetched 46.3 kB in 0s (122 kB/s)\n",
"Selecting previously unselected package bubblewrap.\n",
"(Reading database ... 124561 files and directories currently installed.)\n",
"Preparing to unpack .../bubblewrap_0.6.1-1ubuntu0.1_amd64.deb ...\n",
"Unpacking bubblewrap (0.6.1-1ubuntu0.1) ...\n",
"Setting up bubblewrap (0.6.1-1ubuntu0.1) ...\n",
"Processing triggers for man-db (2.10.2-1) ...\n",
"Looking in indexes: https://test.pypi.org/simple/, https://pypi.python.org/simple\n",
"Collecting llama-stack==0.1.0rc10\n",
" Downloading https://test-files.pythonhosted.org/packages/68/22/4a170fbe01095df81e76c7bf8f35c716c1a0a5ec4503da6e78695fab351c/llama_stack-0.1.0rc10-py3-none-any.whl.metadata (15 kB)\n",
"Collecting blobfile (from llama-stack==0.1.0rc10)\n",
" Downloading blobfile-3.0.0-py3-none-any.whl.metadata (15 kB)\n",
"Collecting fire (from llama-stack==0.1.0rc10)\n",
" Downloading fire-0.7.0.tar.gz (87 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.2/87.2 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: httpx in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (0.28.1)\n",
"Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (0.27.1)\n",
"Collecting llama-models==0.1.0rc10 (from llama-stack==0.1.0rc10)\n",
" Downloading https://test-files.pythonhosted.org/packages/45/2b/6a6947d5915054b9980f82606942f1b79960a27168299254ca12e5b5795b/llama_models-0.1.0rc10-py3-none-any.whl.metadata (8.5 kB)\n",
"Collecting llama-stack-client==0.1.0rc10 (from llama-stack==0.1.0rc10)\n",
" Downloading https://test-files.pythonhosted.org/packages/d6/85/a4fd621c4ae4db7339ab098b37bf4b4ad3cc12440e75ef10ec524e28ef7d/llama_stack_client-0.1.0rc10-py3-none-any.whl.metadata (15 kB)\n",
"Requirement already satisfied: prompt-toolkit in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (3.0.48)\n",
"Collecting python-dotenv (from llama-stack==0.1.0rc10)\n",
" Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)\n",
"Requirement already satisfied: pydantic>=2 in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (2.10.5)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (2.32.3)\n",
"Requirement already satisfied: rich in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (13.9.4)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (75.1.0)\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.11/dist-packages (from llama-stack==0.1.0rc10) (2.5.0)\n",
"Requirement already satisfied: PyYAML in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (6.0.2)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (3.1.5)\n",
"Collecting tiktoken (from llama-models==0.1.0rc10->llama-stack==0.1.0rc10)\n",
" Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
"Requirement already satisfied: Pillow in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (11.1.0)\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (3.7.1)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (8.1.8)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (1.9.0)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (2.2.2)\n",
"Collecting pyaml (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10)\n",
" Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (1.3.1)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (4.67.1)\n",
"Requirement already satisfied: typing-extensions<5,>=4.7 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (4.12.2)\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack==0.1.0rc10) (2024.12.14)\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack==0.1.0rc10) (1.0.7)\n",
"Requirement already satisfied: idna in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack==0.1.0rc10) (3.10)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx->llama-stack==0.1.0rc10) (0.14.0)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic>=2->llama-stack==0.1.0rc10) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.27.2 in /usr/local/lib/python3.11/dist-packages (from pydantic>=2->llama-stack==0.1.0rc10) (2.27.2)\n",
"Collecting pycryptodomex>=3.8 (from blobfile->llama-stack==0.1.0rc10)\n",
" Downloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)\n",
"Requirement already satisfied: urllib3<3,>=1.25.3 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack==0.1.0rc10) (2.3.0)\n",
"Requirement already satisfied: lxml>=4.9 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack==0.1.0rc10) (5.3.0)\n",
"Requirement already satisfied: filelock>=3.0 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack==0.1.0rc10) (3.16.1)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub->llama-stack==0.1.0rc10) (2024.10.0)\n",
"Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub->llama-stack==0.1.0rc10) (24.2)\n",
"Requirement already satisfied: wcwidth in /usr/local/lib/python3.11/dist-packages (from prompt-toolkit->llama-stack==0.1.0rc10) (0.2.13)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->llama-stack==0.1.0rc10) (3.4.1)\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich->llama-stack==0.1.0rc10) (3.0.0)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich->llama-stack==0.1.0rc10) (2.18.0)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich->llama-stack==0.1.0rc10) (0.1.2)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (3.0.2)\n",
"Requirement already satisfied: numpy>=1.23.2 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (1.26.4)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (2024.2)\n",
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.11/dist-packages (from tiktoken->llama-models==0.1.0rc10->llama-stack==0.1.0rc10) (2024.11.6)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas->llama-stack-client==0.1.0rc10->llama-stack==0.1.0rc10) (1.17.0)\n",
"Downloading https://test-files.pythonhosted.org/packages/68/22/4a170fbe01095df81e76c7bf8f35c716c1a0a5ec4503da6e78695fab351c/llama_stack-0.1.0rc10-py3-none-any.whl (532 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m532.7/532.7 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading https://test-files.pythonhosted.org/packages/45/2b/6a6947d5915054b9980f82606942f1b79960a27168299254ca12e5b5795b/llama_models-0.1.0rc10-py3-none-any.whl (1.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading https://test-files.pythonhosted.org/packages/d6/85/a4fd621c4ae4db7339ab098b37bf4b4ad3cc12440e75ef10ec524e28ef7d/llama_stack_client-0.1.0rc10-py3-none-any.whl (328 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m328.5/328.5 kB\u001b[0m \u001b[31m29.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading blobfile-3.0.0-py3-none-any.whl (75 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.4/75.4 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)\n",
"Downloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m57.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)\n",
"Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m64.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hBuilding wheels for collected packages: fire\n",
" Building wheel for fire (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for fire: filename=fire-0.7.0-py3-none-any.whl size=114249 sha256=3a37285ecae37a5fb69bbad717aabdb8c13f0da7906668b7c123475eefa41c3b\n",
" Stored in directory: /root/.cache/pip/wheels/46/54/24/1624fd5b8674eb1188623f7e8e17cdf7c0f6c24b609dfb8a89\n",
"Successfully built fire\n",
"Installing collected packages: python-dotenv, pycryptodomex, pyaml, fire, tiktoken, blobfile, llama-stack-client, llama-models, llama-stack\n",
"Successfully installed blobfile-3.0.0 fire-0.7.0 llama-models-0.1.0rc10 llama-stack-0.1.0rc10 llama-stack-client-0.1.0rc10 pyaml-25.1.0 pycryptodomex-3.21.0 python-dotenv-1.0.1 tiktoken-0.8.0\n"
]
}
],
"source": [ "source": [
"# NBVAL_SKIP\n", "# NBVAL_SKIP\n",
"\n", "\n",
"!apt-get install -y bubblewrap\n", "!apt-get install -y bubblewrap\n",
"# install a branch of llama stack\n", "import os\n",
"!pip install llama-stack" "os.environ[\"UV_SYSTEM_PYTHON\"] = \"1\"\n",
"!pip install uv\n",
"!uv pip install llama-stack"
] ]
}, },
{ {
@ -218,7 +114,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": null,
"id": "HaepEZXCDgif", "id": "HaepEZXCDgif",
"metadata": { "metadata": {
"colab": { "colab": {
@ -228,331 +124,9 @@
"id": "HaepEZXCDgif", "id": "HaepEZXCDgif",
"outputId": "9314f698-593d-4c1a-ea15-15c735dc1023" "outputId": "9314f698-593d-4c1a-ea15-15c735dc1023"
}, },
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: llama-stack in /usr/local/lib/python3.11/dist-packages (0.1.0rc10)\r\n",
"Requirement already satisfied: blobfile in /usr/local/lib/python3.11/dist-packages (from llama-stack) (3.0.0)\r\n",
"Requirement already satisfied: fire in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.7.0)\r\n",
"Requirement already satisfied: httpx in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.28.1)\r\n",
"Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.27.1)\r\n",
"Requirement already satisfied: llama-models==0.1.0rc10 in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.1.0rc10)\r\n",
"Requirement already satisfied: llama-stack-client==0.1.0rc10 in /usr/local/lib/python3.11/dist-packages (from llama-stack) (0.1.0rc10)\r\n",
"Requirement already satisfied: prompt-toolkit in /usr/local/lib/python3.11/dist-packages (from llama-stack) (3.0.48)\r\n",
"Requirement already satisfied: python-dotenv in /usr/local/lib/python3.11/dist-packages (from llama-stack) (1.0.1)\r\n",
"Requirement already satisfied: pydantic>=2 in /usr/local/lib/python3.11/dist-packages (from llama-stack) (2.10.5)\r\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from llama-stack) (2.32.3)\r\n",
"Requirement already satisfied: rich in /usr/local/lib/python3.11/dist-packages (from llama-stack) (13.9.4)\r\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from llama-stack) (75.1.0)\r\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.11/dist-packages (from llama-stack) (2.5.0)\r\n",
"Requirement already satisfied: PyYAML in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack) (6.0.2)\r\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack) (3.1.5)\r\n",
"Requirement already satisfied: tiktoken in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack) (0.8.0)\r\n",
"Requirement already satisfied: Pillow in /usr/local/lib/python3.11/dist-packages (from llama-models==0.1.0rc10->llama-stack) (11.1.0)\r\n",
"Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (3.7.1)\r\n",
"Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (8.1.8)\r\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (1.9.0)\r\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (2.2.2)\r\n",
"Requirement already satisfied: pyaml in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (25.1.0)\r\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (1.3.1)\r\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (4.67.1)\r\n",
"Requirement already satisfied: typing-extensions<5,>=4.7 in /usr/local/lib/python3.11/dist-packages (from llama-stack-client==0.1.0rc10->llama-stack) (4.12.2)\r\n",
"Requirement already satisfied: certifi in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack) (2024.12.14)\r\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack) (1.0.7)\r\n",
"Requirement already satisfied: idna in /usr/local/lib/python3.11/dist-packages (from httpx->llama-stack) (3.10)\r\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx->llama-stack) (0.14.0)\r\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic>=2->llama-stack) (0.7.0)\r\n",
"Requirement already satisfied: pydantic-core==2.27.2 in /usr/local/lib/python3.11/dist-packages (from pydantic>=2->llama-stack) (2.27.2)\r\n",
"Requirement already satisfied: pycryptodomex>=3.8 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack) (3.21.0)\r\n",
"Requirement already satisfied: urllib3<3,>=1.25.3 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack) (2.3.0)\r\n",
"Requirement already satisfied: lxml>=4.9 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack) (5.3.0)\r\n",
"Requirement already satisfied: filelock>=3.0 in /usr/local/lib/python3.11/dist-packages (from blobfile->llama-stack) (3.16.1)\r\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub->llama-stack) (2024.10.0)\r\n",
"Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub->llama-stack) (24.2)\r\n",
"Requirement already satisfied: wcwidth in /usr/local/lib/python3.11/dist-packages (from prompt-toolkit->llama-stack) (0.2.13)\r\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->llama-stack) (3.4.1)\r\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich->llama-stack) (3.0.0)\r\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich->llama-stack) (2.18.0)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich->llama-stack) (0.1.2)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->llama-models==0.1.0rc10->llama-stack) (3.0.2)\n",
"Requirement already satisfied: numpy>=1.23.2 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack) (1.26.4)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->llama-stack-client==0.1.0rc10->llama-stack) (2024.2)\n",
"Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.11/dist-packages (from tiktoken->llama-models==0.1.0rc10->llama-stack) (2024.11.6)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas->llama-stack-client==0.1.0rc10->llama-stack) (1.17.0)\n",
"Installing pip dependencies\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (2.2.2)\n",
"Collecting together\n",
" Downloading together-1.3.11-py3-none-any.whl.metadata (11 kB)\n",
"Collecting datasets\n",
" Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)\n",
"Requirement already satisfied: transformers in /usr/local/lib/python3.11/dist-packages (4.47.1)\n",
"Requirement already satisfied: blobfile in /usr/local/lib/python3.11/dist-packages (3.0.0)\n",
"Requirement already satisfied: opentelemetry-sdk in /usr/local/lib/python3.11/dist-packages (1.29.0)\n",
"Collecting redis\n",
" Downloading redis-5.2.1-py3-none-any.whl.metadata (9.1 kB)\n",
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.11/dist-packages (3.10.0)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (2.32.3)\n",
"Requirement already satisfied: chardet in /usr/local/lib/python3.11/dist-packages (5.2.0)\n",
"Collecting chromadb-client\n",
" Downloading chromadb_client-0.6.3-py3-none-any.whl.metadata (2.4 kB)\n",
"Collecting psycopg2-binary\n",
" Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)\n",
"Collecting mcp\n",
" Downloading mcp-1.2.0-py3-none-any.whl.metadata (15 kB)\n",
"Requirement already satisfied: pillow in /usr/local/lib/python3.11/dist-packages (11.1.0)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (1.13.1)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (4.67.1)\n",
"Requirement already satisfied: nltk in /usr/local/lib/python3.11/dist-packages (3.9.1)\n",
"Requirement already satisfied: sentencepiece in /usr/local/lib/python3.11/dist-packages (0.2.0)\n",
"Collecting faiss-cpu\n",
" Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)\n",
"Collecting opentelemetry-exporter-otlp-proto-http\n",
" Downloading opentelemetry_exporter_otlp_proto_http-1.29.0-py3-none-any.whl.metadata (2.2 kB)\n",
"Collecting autoevals\n",
" Downloading autoevals-0.0.117-py3-none-any.whl.metadata (12 kB)\n",
"Collecting pypdf\n",
" Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)\n",
"Collecting aiosqlite\n",
" Downloading aiosqlite-0.20.0-py3-none-any.whl.metadata (4.3 kB)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (1.26.4)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.11/dist-packages (1.6.0)\n",
"Requirement already satisfied: openai in /usr/local/lib/python3.11/dist-packages (1.59.6)\n",
"Collecting fastapi\n",
" Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)\n",
"Requirement already satisfied: fire in /usr/local/lib/python3.11/dist-packages (0.7.0)\n",
"Requirement already satisfied: httpx in /usr/local/lib/python3.11/dist-packages (0.28.1)\n",
"Collecting uvicorn\n",
" Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas) (2024.2)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas) (2024.2)\n",
"Requirement already satisfied: aiohttp<4.0.0,>=3.9.3 in /usr/local/lib/python3.11/dist-packages (from together) (3.11.11)\n",
"Requirement already satisfied: click<9.0.0,>=8.1.7 in /usr/local/lib/python3.11/dist-packages (from together) (8.1.8)\n",
"Requirement already satisfied: eval-type-backport<0.3.0,>=0.1.3 in /usr/local/lib/python3.11/dist-packages (from together) (0.2.2)\n",
"Requirement already satisfied: filelock<4.0.0,>=3.13.1 in /usr/local/lib/python3.11/dist-packages (from together) (3.16.1)\n",
"Collecting pillow\n",
" Downloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.2 kB)\n",
"Requirement already satisfied: pyarrow>=10.0.1 in /usr/local/lib/python3.11/dist-packages (from together) (17.0.0)\n",
"Requirement already satisfied: pydantic<3.0.0,>=2.6.3 in /usr/local/lib/python3.11/dist-packages (from together) (2.10.5)\n",
"Requirement already satisfied: rich<14.0.0,>=13.8.1 in /usr/local/lib/python3.11/dist-packages (from together) (13.9.4)\n",
"Requirement already satisfied: tabulate<0.10.0,>=0.9.0 in /usr/local/lib/python3.11/dist-packages (from together) (0.9.0)\n",
"Requirement already satisfied: typer<0.16,>=0.9 in /usr/local/lib/python3.11/dist-packages (from together) (0.15.1)\n",
"Collecting dill<0.3.9,>=0.3.0 (from datasets)\n",
" Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
"Collecting xxhash (from datasets)\n",
" Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
"Collecting multiprocess<0.70.17 (from datasets)\n",
" Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)\n",
"Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)\n",
" Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)\n",
"Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.27.1)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from datasets) (24.2)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from datasets) (6.0.2)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2024.11.6)\n",
"Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.21.0)\n",
"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.5.2)\n",
"Requirement already satisfied: pycryptodomex>=3.8 in /usr/local/lib/python3.11/dist-packages (from blobfile) (3.21.0)\n",
"Requirement already satisfied: urllib3<3,>=1.25.3 in /usr/local/lib/python3.11/dist-packages (from blobfile) (2.3.0)\n",
"Requirement already satisfied: lxml>=4.9 in /usr/local/lib/python3.11/dist-packages (from blobfile) (5.3.0)\n",
"Requirement already satisfied: opentelemetry-api==1.29.0 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-sdk) (1.29.0)\n",
"Requirement already satisfied: opentelemetry-semantic-conventions==0.50b0 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-sdk) (0.50b0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-sdk) (4.12.2)\n",
"Requirement already satisfied: deprecated>=1.2.6 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-api==1.29.0->opentelemetry-sdk) (1.2.15)\n",
"Requirement already satisfied: importlib-metadata<=8.5.0,>=6.0 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-api==1.29.0->opentelemetry-sdk) (8.5.0)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.3.1)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (4.55.3)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.4.8)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (3.2.1)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests) (3.4.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests) (3.10)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests) (2024.12.14)\n",
"Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb-client)\n",
" Downloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3-none-any.whl.metadata (2.2 kB)\n",
"Collecting overrides>=7.3.1 (from chromadb-client)\n",
" Downloading overrides-7.7.0-py3-none-any.whl.metadata (5.8 kB)\n",
"Collecting posthog>=2.4.0 (from chromadb-client)\n",
" Downloading posthog-3.8.4-py2.py3-none-any.whl.metadata (2.8 kB)\n",
"Requirement already satisfied: tenacity>=8.2.3 in /usr/local/lib/python3.11/dist-packages (from chromadb-client) (9.0.0)\n",
"Requirement already satisfied: orjson>=3.9.12 in /usr/local/lib/python3.11/dist-packages (from chromadb-client) (3.10.14)\n",
"Collecting anyio>=4.5 (from mcp)\n",
" Downloading anyio-4.8.0-py3-none-any.whl.metadata (4.6 kB)\n",
"Collecting httpx-sse>=0.4 (from mcp)\n",
" Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)\n",
"Collecting pydantic-settings>=2.6.1 (from mcp)\n",
" Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)\n",
"Collecting sse-starlette>=1.6.1 (from mcp)\n",
" Downloading sse_starlette-2.2.1-py3-none-any.whl.metadata (7.8 kB)\n",
"Collecting starlette>=0.27 (from mcp)\n",
" Downloading starlette-0.45.2-py3-none-any.whl.metadata (6.3 kB)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk) (1.4.2)\n",
"Requirement already satisfied: googleapis-common-protos~=1.52 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-exporter-otlp-proto-http) (1.66.0)\n",
"Collecting opentelemetry-exporter-otlp-proto-common==1.29.0 (from opentelemetry-exporter-otlp-proto-http)\n",
" Downloading opentelemetry_exporter_otlp_proto_common-1.29.0-py3-none-any.whl.metadata (1.8 kB)\n",
"Collecting opentelemetry-proto==1.29.0 (from opentelemetry-exporter-otlp-proto-http)\n",
" Downloading opentelemetry_proto-1.29.0-py3-none-any.whl.metadata (2.3 kB)\n",
"Collecting protobuf<6.0,>=5.0 (from opentelemetry-proto==1.29.0->opentelemetry-exporter-otlp-proto-http)\n",
" Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)\n",
"Collecting chevron (from autoevals)\n",
" Downloading chevron-0.14.0-py3-none-any.whl.metadata (4.9 kB)\n",
"Collecting levenshtein (from autoevals)\n",
" Downloading levenshtein-0.26.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)\n",
"Collecting braintrust_core==0.0.58 (from autoevals)\n",
" Downloading braintrust_core-0.0.58-py3-none-any.whl.metadata (669 bytes)\n",
"Requirement already satisfied: jsonschema in /usr/local/lib/python3.11/dist-packages (from autoevals) (4.23.0)\n",
"Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (3.5.0)\n",
"Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.11/dist-packages (from openai) (1.9.0)\n",
"Requirement already satisfied: jiter<1,>=0.4.0 in /usr/local/lib/python3.11/dist-packages (from openai) (0.8.2)\n",
"Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from openai) (1.3.1)\n",
"Collecting starlette>=0.27 (from mcp)\n",
" Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.11/dist-packages (from fire) (2.5.0)\n",
"Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx) (1.0.7)\n",
"Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.11/dist-packages (from httpcore==1.*->httpx) (0.14.0)\n",
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (2.4.4)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (1.3.2)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (24.3.0)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (1.5.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (6.1.0)\n",
"Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (0.2.1)\n",
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp<4.0.0,>=3.9.3->together) (1.18.3)\n",
"Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.11/dist-packages (from deprecated>=1.2.6->opentelemetry-api==1.29.0->opentelemetry-sdk) (1.17.0)\n",
"Requirement already satisfied: grpcio<2.0.0,>=1.63.2 in /usr/local/lib/python3.11/dist-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb-client) (1.69.0)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from posthog>=2.4.0->chromadb-client) (1.17.0)\n",
"Collecting monotonic>=1.5 (from posthog>=2.4.0->chromadb-client)\n",
" Downloading monotonic-1.6-py2.py3-none-any.whl.metadata (1.5 kB)\n",
"Collecting backoff>=1.10.0 (from posthog>=2.4.0->chromadb-client)\n",
" Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<3.0.0,>=2.6.3->together) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.27.2 in /usr/local/lib/python3.11/dist-packages (from pydantic<3.0.0,>=2.6.3->together) (2.27.2)\n",
"Requirement already satisfied: python-dotenv>=0.21.0 in /usr/local/lib/python3.11/dist-packages (from pydantic-settings>=2.6.1->mcp) (1.0.1)\n",
"Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from rich<14.0.0,>=13.8.1->together) (3.0.0)\n",
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.11/dist-packages (from rich<14.0.0,>=13.8.1->together) (2.18.0)\n",
"Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.11/dist-packages (from typer<0.16,>=0.9->together) (1.5.4)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.11/dist-packages (from jsonschema->autoevals) (2024.10.1)\n",
"Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.11/dist-packages (from jsonschema->autoevals) (0.35.1)\n",
"Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.11/dist-packages (from jsonschema->autoevals) (0.22.3)\n",
"Collecting rapidfuzz<4.0.0,>=3.9.0 (from levenshtein->autoevals)\n",
" Downloading rapidfuzz-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n",
"Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.11/dist-packages (from importlib-metadata<=8.5.0,>=6.0->opentelemetry-api==1.29.0->opentelemetry-sdk) (3.21.0)\n",
"Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.11/dist-packages (from markdown-it-py>=2.2.0->rich<14.0.0,>=13.8.1->together) (0.1.2)\n",
"Downloading together-1.3.11-py3-none-any.whl (70 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m70.6/70.6 kB\u001b[0m \u001b[31m7.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading datasets-3.2.0-py3-none-any.whl (480 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading redis-5.2.1-py3-none-any.whl (261 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.5/261.5 kB\u001b[0m \u001b[31m25.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading chromadb_client-0.6.3-py3-none-any.whl (609 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m609.2/609.2 kB\u001b[0m \u001b[31m38.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m100.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading mcp-1.2.0-py3-none-any.whl (66 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.5/66.5 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl (4.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m106.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m27.5/27.5 MB\u001b[0m \u001b[31m78.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading opentelemetry_exporter_otlp_proto_http-1.29.0-py3-none-any.whl (17 kB)\n",
"Downloading opentelemetry_exporter_otlp_proto_common-1.29.0-py3-none-any.whl (18 kB)\n",
"Downloading opentelemetry_proto-1.29.0-py3-none-any.whl (55 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.8/55.8 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading autoevals-0.0.117-py3-none-any.whl (41 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.4/41.4 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading braintrust_core-0.0.58-py3-none-any.whl (4.4 kB)\n",
"Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.0/298.0 kB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading aiosqlite-0.20.0-py3-none-any.whl (15 kB)\n",
"Downloading fastapi-0.115.6-py3-none-any.whl (94 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.8/94.8 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading uvicorn-0.34.0-py3-none-any.whl (62 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.3/62.3 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading anyio-4.8.0-py3-none-any.whl (96 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m96.0/96.0 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m12.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m17.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)\n",
"Downloading multiprocess-0.70.16-py311-none-any.whl (143 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.5/143.5 kB\u001b[0m \u001b[31m14.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading opentelemetry_exporter_otlp_proto_grpc-1.29.0-py3-none-any.whl (18 kB)\n",
"Downloading overrides-7.7.0-py3-none-any.whl (17 kB)\n",
"Downloading posthog-3.8.4-py2.py3-none-any.whl (69 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.8/69.8 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading pydantic_settings-2.7.1-py3-none-any.whl (29 kB)\n",
"Downloading sse_starlette-2.2.1-py3-none-any.whl (10 kB)\n",
"Downloading starlette-0.41.3-py3-none-any.whl (73 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.2/73.2 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading chevron-0.14.0-py3-none-any.whl (11 kB)\n",
"Downloading levenshtein-0.26.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m162.7/162.7 kB\u001b[0m \u001b[31m17.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.8/194.8 kB\u001b[0m \u001b[31m21.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading backoff-2.2.1-py3-none-any.whl (15 kB)\n",
"Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n",
"Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl (319 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m319.7/319.7 kB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading rapidfuzz-3.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m84.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: monotonic, chevron, xxhash, uvicorn, redis, rapidfuzz, pypdf, psycopg2-binary, protobuf, pillow, overrides, httpx-sse, fsspec, faiss-cpu, dill, braintrust_core, backoff, anyio, aiosqlite, starlette, posthog, opentelemetry-proto, multiprocess, levenshtein, sse-starlette, pydantic-settings, opentelemetry-exporter-otlp-proto-common, fastapi, together, mcp, datasets, autoevals, opentelemetry-exporter-otlp-proto-http, opentelemetry-exporter-otlp-proto-grpc, chromadb-client\n",
" Attempting uninstall: protobuf\n",
" Found existing installation: protobuf 4.25.5\n",
" Uninstalling protobuf-4.25.5:\n",
" Successfully uninstalled protobuf-4.25.5\n",
" Attempting uninstall: pillow\n",
" Found existing installation: pillow 11.1.0\n",
" Uninstalling pillow-11.1.0:\n",
" Successfully uninstalled pillow-11.1.0\n",
" Attempting uninstall: fsspec\n",
" Found existing installation: fsspec 2024.10.0\n",
" Uninstalling fsspec-2024.10.0:\n",
" Successfully uninstalled fsspec-2024.10.0\n",
" Attempting uninstall: anyio\n",
" Found existing installation: anyio 3.7.1\n",
" Uninstalling anyio-3.7.1:\n",
" Successfully uninstalled anyio-3.7.1\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"jupyter-server 1.24.0 requires anyio<4,>=3.1.0, but you have anyio 4.8.0 which is incompatible.\n",
"gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\n",
"tensorflow 2.17.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.3 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0mSuccessfully installed aiosqlite-0.20.0 anyio-4.8.0 autoevals-0.0.117 backoff-2.2.1 braintrust_core-0.0.58 chevron-0.14.0 chromadb-client-0.6.3 datasets-3.2.0 dill-0.3.8 faiss-cpu-1.9.0.post1 fastapi-0.115.6 fsspec-2024.9.0 httpx-sse-0.4.0 levenshtein-0.26.1 mcp-1.2.0 monotonic-1.6 multiprocess-0.70.16 opentelemetry-exporter-otlp-proto-common-1.29.0 opentelemetry-exporter-otlp-proto-grpc-1.29.0 opentelemetry-exporter-otlp-proto-http-1.29.0 opentelemetry-proto-1.29.0 overrides-7.7.0 pillow-10.4.0 posthog-3.8.4 protobuf-5.29.3 psycopg2-binary-2.9.10 pydantic-settings-2.7.1 pypdf-5.1.0 rapidfuzz-3.11.0 redis-5.2.1 sse-starlette-2.2.1 starlette-0.41.3 together-1.3.11 uvicorn-0.34.0 xxhash-3.5.0\n",
"torch --index-url https://download.pytorch.org/whl/cpu\n",
"Looking in indexes: https://download.pytorch.org/whl/cpu\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.11/dist-packages (2.5.1+cu121)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from torch) (3.16.1)\n",
"Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.11/dist-packages (from torch) (4.12.2)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch) (3.4.2)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.5)\n",
"Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch) (2024.9.0)\n",
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch) (9.1.0.70)\n",
"Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.3.1)\n",
"Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.11/dist-packages (from torch) (11.0.2.54)\n",
"Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.11/dist-packages (from torch) (10.3.2.106)\n",
"Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.11/dist-packages (from torch) (11.4.5.107)\n",
"Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.0.106)\n",
"Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch) (2.21.5)\n",
"Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.11/dist-packages (from torch) (12.1.105)\n",
"Requirement already satisfied: triton==3.1.0 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.0)\n",
"Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch) (1.13.1)\n",
"Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.11/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch) (12.6.85)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch) (1.3.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch) (3.0.2)\n",
"sentence-transformers --no-deps\n",
"Requirement already satisfied: sentence-transformers in /usr/local/lib/python3.11/dist-packages (3.3.1)\n",
"\u001b[32mBuild Successful!\u001b[0m\n"
]
}
],
"source": [ "source": [
"# NBVAL_SKIP\n", "# NBVAL_SKIP\n",
"\n",
"# This will build all the dependencies you will need\n", "# This will build all the dependencies you will need\n",
"!llama stack build --template together --image-type venv" "!llama stack build --template together --image-type venv"
] ]
@ -571,7 +145,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 4,
"id": "E1UFuJC570Tk", "id": "E1UFuJC570Tk",
"metadata": { "metadata": {
"colab": { "colab": {
@ -1125,11 +699,8 @@
" if not api_key:\n", " if not api_key:\n",
" raise ValueError(f\"{key} environment variable is empty\")\n", " raise ValueError(f\"{key} environment variable is empty\")\n",
" except KeyError:\n", " except KeyError:\n",
" raise KeyError(\n", " api_key = input(f\"{key} environment variable is not set. Please enter your API key: \")\n",
" f\"{key} environment variable is not set. \"\n", " os.environ[key] = api_key\n",
" \"Please set your API key using in userdata (if using google colab notebook)\"\n",
" f\"or using `export {key}='your-api-key-here'`\"\n",
" ) from None\n",
"\n", "\n",
"from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n", "from llama_stack.distribution.library_client import LlamaStackAsLibraryClient\n",
"client = LlamaStackAsLibraryClient(\"together\", provider_data = {\"tavily_search_api_key\": os.environ['TAVILY_SEARCH_API_KEY']})\n", "client = LlamaStackAsLibraryClient(\"together\", provider_data = {\"tavily_search_api_key\": os.environ['TAVILY_SEARCH_API_KEY']})\n",
@ -1150,7 +721,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 5,
"id": "ruO9jQna_t_S", "id": "ruO9jQna_t_S",
"metadata": { "metadata": {
"colab": { "colab": {
@ -1211,7 +782,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 6,
"id": "LINBvv8lwTJh", "id": "LINBvv8lwTJh",
"metadata": { "metadata": {
"colab": { "colab": {
@ -1228,7 +799,7 @@
"'meta-llama/Llama-3.1-70B-Instruct'" "'meta-llama/Llama-3.1-70B-Instruct'"
] ]
}, },
"execution_count": 4, "execution_count": 6,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -1253,7 +824,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 7,
"id": "77c29dba", "id": "77c29dba",
"metadata": { "metadata": {
"colab": { "colab": {
@ -1267,7 +838,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Here's a two-sentence poem about a llama:\n", "Here is a two-sentence poem about a llama:\n",
"\n", "\n",
"With gentle eyes and a soft, fuzzy face,\n", "With gentle eyes and a soft, fuzzy face,\n",
"The llama roams, a peaceful, gentle pace.\n" "The llama roams, a peaceful, gentle pace.\n"
@ -2084,13 +1655,14 @@
} }
], ],
"source": [ "source": [
"import uuid\n",
"from llama_stack_client.lib.agents.agent import Agent\n", "from llama_stack_client.lib.agents.agent import Agent\n",
"from llama_stack_client.lib.agents.event_logger import EventLogger\n", "from llama_stack_client.lib.agents.event_logger import EventLogger\n",
"from llama_stack_client.types.agent_create_params import AgentConfig\n", "from llama_stack_client.types.agent_create_params import AgentConfig\n",
"from termcolor import cprint\n", "from termcolor import cprint\n",
"from llama_stack_client.types import Document\n", "from llama_stack_client.types import Document\n",
"\n", "\n",
"urls = [\"chat.rst\", \"llama3.rst\", \"datasets.rst\", \"lora_finetune.rst\"]\n", "urls = [\"chat.rst\", \"llama3.rst\", \"memory_optimizations.rst\", \"lora_finetune.rst\"]\n",
"documents = [\n", "documents = [\n",
" Document(\n", " Document(\n",
" document_id=f\"num-{i}\",\n", " document_id=f\"num-{i}\",\n",
@ -2101,7 +1673,7 @@
" for i, url in enumerate(urls)\n", " for i, url in enumerate(urls)\n",
"]\n", "]\n",
"\n", "\n",
"vector_db_id = \"test-vector-db\"\n", "vector_db_id = f\"test-vector-db-{uuid.uuid4().hex}\"\n",
"client.vector_dbs.register(\n", "client.vector_dbs.register(\n",
" vector_db_id=vector_db_id,\n", " vector_db_id=vector_db_id,\n",
" embedding_model=\"all-MiniLM-L6-v2\",\n", " embedding_model=\"all-MiniLM-L6-v2\",\n",
@ -2398,6 +1970,7 @@
} }
], ],
"source": [ "source": [
"# NBVAL_SKIP\n",
"!pip install colab-xterm #https://pypi.org/project/colab-xterm/\n", "!pip install colab-xterm #https://pypi.org/project/colab-xterm/\n",
"%load_ext colabxterm" "%load_ext colabxterm"
] ]
@ -2774,7 +2347,7 @@
} }
], ],
"source": [ "source": [
"\n", "# NBVAL_SKIP\n",
"%xterm\n", "%xterm\n",
"# touch /content/foo\n", "# touch /content/foo\n",
"# touch /content/bar\n", "# touch /content/bar\n",
@ -2800,6 +2373,7 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"# NBVAL_SKIP\n",
"from llama_stack_client.types.shared_params.url import URL\n", "from llama_stack_client.types.shared_params.url import URL\n",
"client.toolgroups.register(\n", "client.toolgroups.register(\n",
" toolgroup_id=\"mcp::filesystem\",\n", " toolgroup_id=\"mcp::filesystem\",\n",
@ -3170,6 +2744,7 @@
} }
], ],
"source": [ "source": [
"# NBVAL_SKIP\n",
"from llama_stack_client.lib.agents.agent import Agent\n", "from llama_stack_client.lib.agents.agent import Agent\n",
"from llama_stack_client.lib.agents.event_logger import EventLogger\n", "from llama_stack_client.lib.agents.event_logger import EventLogger\n",
"from llama_stack_client.types.agent_create_params import AgentConfig\n", "from llama_stack_client.types.agent_create_params import AgentConfig\n",
@ -3821,6 +3396,231 @@
"response = client.scoring.score(input_rows=rows, scoring_functions=scoring_params)\n", "response = client.scoring.score(input_rows=rows, scoring_functions=scoring_params)\n",
"pprint(response)\n" "pprint(response)\n"
] ]
},
{
"cell_type": "markdown",
"id": "ad077440",
"metadata": {},
"source": [
"## 4. Image Understanding with Llama 3.2\n",
"\n",
"Below is a complete example of using Together's Llama Stack 0.1 server at https://llama-stack.together.ai to ask Llama 3.2 questions about an image."
]
},
{
"cell_type": "markdown",
"id": "82e381ec",
"metadata": {},
"source": [
"### 4.1 Setup and helpers\n",
"\n",
"Below we install the Llama Stack client 0.1, download the example image, define two image helpers, and set Llama Stack Together server URL and Llama 3.2 model name.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "865fc5a8",
"metadata": {},
"outputs": [],
"source": [
"!pip install llama-stack-client==0.1.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44e05e16",
"metadata": {},
"outputs": [],
"source": [
"!wget https://raw.githubusercontent.com/meta-llama/llama-models/refs/heads/main/Llama_Repo.jpeg"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "469750f7",
"metadata": {},
"outputs": [],
"source": [
"from PIL import Image\n",
"import matplotlib.pyplot as plt\n",
"\n",
"def display_image(path):\n",
" img = Image.open(path)\n",
" plt.imshow(img)\n",
" plt.axis('off')\n",
" plt.show()\n",
"\n",
"display_image(\"Llama_Repo.jpeg\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2c1e1c2",
"metadata": {},
"outputs": [],
"source": [
"import base64\n",
"\n",
"def encode_image(image_path):\n",
" with open(image_path, \"rb\") as image_file:\n",
" base64_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n",
" base64_url = f\"data:image/png;base64,{base64_string}\"\n",
" return base64_url"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c565f99e",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client import LlamaStackClient\n",
"\n",
"LLAMA_STACK_API_TOGETHER_URL=\"https://llama-stack.together.ai\"\n",
"LLAMA32_11B_INSTRUCT = \"meta-llama/Llama-3.2-11B-Vision-Instruct\""
]
},
{
"cell_type": "markdown",
"id": "7737cd41",
"metadata": {},
"source": [
"### 4.2 Using Llama Stack Chat API\n",
"\n",
"The code below uses the Llama Stack 0.1's chat API to interact with Llama 3.2:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7914894",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client.lib.inference.event_logger import EventLogger\n",
"\n",
"async def run_main(image_path: str, prompt):\n",
" client = LlamaStackClient(\n",
" base_url=LLAMA_STACK_API_TOGETHER_URL,\n",
" )\n",
"\n",
" message = {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"image\": {\n",
" \"url\": {\n",
" \"uri\": encode_image(image_path)\n",
" }\n",
" }\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": prompt,\n",
" }\n",
" ]\n",
" }\n",
"\n",
" response = client.inference.chat_completion(\n",
" messages=[message],\n",
" model_id=LLAMA32_11B_INSTRUCT,\n",
" stream=False,\n",
" )\n",
"\n",
" print(response.completion_message.content.lower().strip())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ee09b97",
"metadata": {},
"outputs": [],
"source": [
"await run_main(\"Llama_Repo.jpeg\",\n",
" \"How many different colors are those llamas?\\\n",
" What are those colors?\")"
]
},
{
"cell_type": "markdown",
"id": "e741d7b9",
"metadata": {},
"source": [
"### 4.3 Using Llama Stack Agent API\n",
"\n",
"The code below uses the Llama Stack 0.1's Agent API to interact with Llama 3.2:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f9a83275",
"metadata": {},
"outputs": [],
"source": [
"from llama_stack_client.lib.agents.agent import Agent\n",
"from llama_stack_client.lib.agents.event_logger import EventLogger\n",
"from llama_stack_client.types.agent_create_params import AgentConfig\n",
"\n",
"async def run_main(image_path, prompt):\n",
" base64_image = encode_image(image_path)\n",
"\n",
" client = LlamaStackClient(\n",
" base_url=LLAMA_STACK_API_TOGETHER_URL,\n",
" )\n",
"\n",
" agent_config = AgentConfig(\n",
" model=LLAMA32_11B_INSTRUCT,\n",
" instructions=\"You are a helpful assistant\",\n",
" enable_session_persistence=False,\n",
" )\n",
"\n",
" agent = Agent(client, agent_config)\n",
" session_id = agent.create_session(\"test-session\")\n",
"\n",
" response = agent.create_turn(\n",
" messages=[{\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\n",
" \"type\": \"image\",\n",
" \"image\": {\n",
" \"url\": {\n",
" \"uri\": encode_image(image_path)\n",
" }\n",
" }\n",
" },\n",
" {\n",
" \"type\": \"text\",\n",
" \"text\": prompt,\n",
" }\n",
" ]\n",
" }],\n",
" session_id=session_id,\n",
" )\n",
"\n",
" for log in EventLogger().log(response):\n",
" log.print()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15d0098b",
"metadata": {},
"outputs": [],
"source": [
"await run_main(\"Llama_Repo.jpeg\",\n",
" \"How many different colors are those llamas?\\\n",
" What are those colors?\")"
]
} }
], ],
"metadata": { "metadata": {
@ -3830,7 +3630,8 @@
"provenance": [] "provenance": []
}, },
"kernelspec": { "kernelspec": {
"display_name": "Python 3", "display_name": "toolchain",
"language": "python",
"name": "python3" "name": "python3"
}, },
"language_info": { "language_info": {

View file

@ -6,7 +6,7 @@
"id": "hTIfyoGtjoWD" "id": "hTIfyoGtjoWD"
}, },
"source": [ "source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1UvR9m2KTinvlDXeOWfS2HBU4X72LAjTz?usp=sharing)\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb)\n",
"\n", "\n",
"# Llama Stack Benchmark Evals\n", "# Llama Stack Benchmark Evals\n",
"\n", "\n",
@ -1383,7 +1383,8 @@
"provenance": [] "provenance": []
}, },
"kernelspec": { "kernelspec": {
"display_name": "Python 3", "display_name": "master",
"language": "python",
"name": "python3" "name": "python3"
}, },
"language_info": { "language_info": {

View file

@ -36,6 +36,16 @@ from .pyopenapi.specification import Info, Server # noqa: E402
from .pyopenapi.utility import Specification # noqa: E402 from .pyopenapi.utility import Specification # noqa: E402
def str_presenter(dumper, data):
if data.startswith(f"/{LLAMA_STACK_API_VERSION}") or data.startswith(
"#/components/schemas/"
):
style = None
else:
style = ">" if "\n" in data or len(data) > 40 else None
return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)
def main(output_dir: str): def main(output_dir: str):
output_dir = Path(output_dir) output_dir = Path(output_dir)
if not output_dir.exists(): if not output_dir.exists():
@ -69,7 +79,8 @@ def main(output_dir: str):
y.sequence_dash_offset = 2 y.sequence_dash_offset = 2
y.width = 80 y.width = 80
y.allow_unicode = True y.allow_unicode = True
y.explicit_start = True y.representer.add_representer(str, str_presenter)
y.dump( y.dump(
spec.get_json(), spec.get_json(),
fp, fp,

View file

@ -4,10 +4,10 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import collections
import hashlib import hashlib
import ipaddress import ipaddress
import typing import typing
from dataclasses import make_dataclass
from typing import Any, Dict, Set, Union from typing import Any, Dict, Set, Union
from ..strong_typing.core import JsonType from ..strong_typing.core import JsonType
@ -177,20 +177,37 @@ class ContentBuilder:
) -> Dict[str, MediaType]: ) -> Dict[str, MediaType]:
"Creates the content subtree for a request or response." "Creates the content subtree for a request or response."
def has_iterator_type(t): def is_iterator_type(t):
if typing.get_origin(t) is typing.Union:
return any(has_iterator_type(a) for a in typing.get_args(t))
else:
# TODO: needs a proper fix where we let all types correctly flow upwards
# and then test against AsyncIterator
return "StreamChunk" in str(t) return "StreamChunk" in str(t)
def get_media_type(t):
if is_generic_list(t):
return "application/jsonl"
elif is_iterator_type(t):
return "text/event-stream"
else:
return "application/json"
if typing.get_origin(payload_type) is typing.Union:
media_types = []
item_types = []
for x in typing.get_args(payload_type):
media_types.append(get_media_type(x))
item_types.append(x)
if len(set(media_types)) == 1:
# all types have the same media type
return {media_types[0]: self.build_media_type(payload_type, examples)}
else:
# different types have different media types
return {
media_type: self.build_media_type(item_type, examples)
for media_type, item_type in zip(media_types, item_types)
}
if is_generic_list(payload_type): if is_generic_list(payload_type):
media_type = "application/jsonl" media_type = "application/jsonl"
item_type = unwrap_generic_list(payload_type) item_type = unwrap_generic_list(payload_type)
elif has_iterator_type(payload_type):
item_type = payload_type
media_type = "text/event-stream"
else: else:
media_type = "application/json" media_type = "application/json"
item_type = payload_type item_type = payload_type
@ -233,7 +250,9 @@ class ContentBuilder:
value = sample_transformer(object_to_json(example)) value = sample_transformer(object_to_json(example))
hash_string = ( hash_string = (
hashlib.md5(json_dump_string(value).encode("utf-8")).digest().hex() hashlib.sha256(json_dump_string(value).encode("utf-8"))
.digest()
.hex()[:16]
) )
name = f"ex-{hash_string}" name = f"ex-{hash_string}"
@ -276,6 +295,20 @@ class StatusResponse:
examples: List[Any] = dataclasses.field(default_factory=list) examples: List[Any] = dataclasses.field(default_factory=list)
def create_docstring_for_request(
    request_name: str, fields: List[Tuple[str, type]], doc_params: Dict[str, str]
) -> str:
"""Creates a ReST-style docstring for a dynamically generated request dataclass."""
lines = ["\n"] # Short description
# Add parameter documentation in ReST format
for name, type_ in fields:
desc = doc_params.get(name, "")
lines.append(f":param {name}: {desc}")
return "\n".join(lines)
class ResponseBuilder: class ResponseBuilder:
content_builder: ContentBuilder content_builder: ContentBuilder
@ -493,11 +526,24 @@ class Generator:
first = next(iter(op.request_params)) first = next(iter(op.request_params))
request_name, request_type = first request_name, request_type = first
from dataclasses import make_dataclass
op_name = "".join(word.capitalize() for word in op.name.split("_")) op_name = "".join(word.capitalize() for word in op.name.split("_"))
request_name = f"{op_name}Request" request_name = f"{op_name}Request"
request_type = make_dataclass(request_name, op.request_params) fields = [
(
name,
type_,
)
for name, type_ in op.request_params
]
request_type = make_dataclass(
request_name,
fields,
namespace={
"__doc__": create_docstring_for_request(
request_name, fields, doc_params
)
},
)
requestBody = RequestBody( requestBody = RequestBody(
content={ content={
@ -598,10 +644,14 @@ class Generator:
else: else:
callbacks = None callbacks = None
description = "\n".join(
filter(None, [doc_string.short_description, doc_string.long_description])
)
return Operation( return Operation(
tags=[op.defining_class.__name__], tags=[op.defining_class.__name__],
summary=doc_string.short_description, summary=None,
description=doc_string.long_description, # summary=doc_string.short_description,
description=description,
parameters=parameters, parameters=parameters,
requestBody=requestBody, requestBody=requestBody,
responses=responses, responses=responses,
@ -633,6 +683,7 @@ class Generator:
raise NotImplementedError(f"unknown HTTP method: {op.http_method}") raise NotImplementedError(f"unknown HTTP method: {op.http_method}")
route = op.get_route() route = op.get_route()
route = route.replace(":path", "")
print(f"route: {route}") print(f"route: {route}")
if route in paths: if route in paths:
paths[route].update(pathItem) paths[route].update(pathItem)
@ -650,12 +701,6 @@ class Generator:
) )
) )
# types that are produced/consumed by operations
type_tags = [
self._build_type_tag(ref, schema)
for ref, schema in self.schema_builder.schemas.items()
]
# types that are emitted by events # types that are emitted by events
event_tags: List[Tag] = [] event_tags: List[Tag] = []
events = get_endpoint_events(self.endpoint) events = get_endpoint_events(self.endpoint)
@ -682,7 +727,6 @@ class Generator:
# list all operations and types # list all operations and types
tags: List[Tag] = [] tags: List[Tag] = []
tags.extend(operation_tags) tags.extend(operation_tags)
tags.extend(type_tags)
tags.extend(event_tags) tags.extend(event_tags)
for extra_tag_group in extra_tag_groups.values(): for extra_tag_group in extra_tag_groups.values():
tags.extend(extra_tag_group) tags.extend(extra_tag_group)
@ -697,13 +741,6 @@ class Generator:
tags=sorted(tag.name for tag in operation_tags), tags=sorted(tag.name for tag in operation_tags),
) )
) )
if type_tags:
tag_groups.append(
TagGroup(
name=self.options.map("Types"),
tags=sorted(tag.name for tag in type_tags),
)
)
if event_tags: if event_tags:
tag_groups.append( tag_groups.append(
TagGroup( TagGroup(

View file

@ -130,6 +130,8 @@ class _FormatParameterExtractor:
def _get_route_parameters(route: str) -> List[str]: def _get_route_parameters(route: str) -> List[str]:
extractor = _FormatParameterExtractor() extractor = _FormatParameterExtractor()
# Replace all occurrences of ":path" with empty string
route = route.replace(":path", "")
route.format_map(extractor) route.format_map(extractor)
return extractor.keys return extractor.keys

View file

@ -6,36 +6,36 @@
<meta name="viewport" content="width=device-width, initial-scale=1"> <meta name="viewport" content="width=device-width, initial-scale=1">
<title>OpenAPI specification</title> <title>OpenAPI specification</title>
<link href="https://fonts.googleapis.com/css?family=Montserrat:300,400,700|Roboto:300,400,700" rel="stylesheet"> <link href="https://fonts.googleapis.com/css?family=Montserrat:300,400,700|Roboto:300,400,700" rel="stylesheet">
<script type="module" src="https://unpkg.com/@stoplight/elements/web-components.min.js"></script>
<link rel="stylesheet" href="https://unpkg.com/@stoplight/elements/styles.min.css">
<style> <style>
body { body {
margin: 0; margin: 0;
padding: 0; padding: 0;
height: 100vh;
} }
elements-api {
height: 100%;
}
</style> </style>
<script defer="defer" src="https://cdn.redoc.ly/redoc/latest/bundles/redoc.standalone.js"></script> </head>
<script defer="defer">
<body>
<elements-api id="openapi-container" router="hash" layout="sidebar" hideExport="true"
hideInternal="true"></elements-api>
<script>
document.addEventListener("DOMContentLoaded", function () { document.addEventListener("DOMContentLoaded", function () {
spec = { /* OPENAPI_SPECIFICATION */ }; const spec = { /* OPENAPI_SPECIFICATION */ };
options = { const element = document.getElementById("openapi-container");
downloadFileName: "openapi.json", element.apiDescriptionDocument = spec;
expandResponses: "200",
expandSingleSchemaField: true,
jsonSampleExpandLevel: "all",
schemaExpansionLevel: "all",
};
element = document.getElementById("openapi-container");
Redoc.init(spec, options, element);
if (spec.info && spec.info.title) { if (spec.info && spec.info.title) {
document.title = spec.info.title; document.title = spec.info.title;
} }
}); });
</script> </script>
</head>
<body>
<div id="openapi-container"></div>
</body> </body>
</html> </html>

View file

@ -29,4 +29,5 @@ fi
stack_dir=$(dirname $(dirname $THIS_DIR)) stack_dir=$(dirname $(dirname $THIS_DIR))
models_dir=$(dirname $stack_dir)/llama-models models_dir=$(dirname $stack_dir)/llama-models
PYTHONPATH=$PYTHONPATH:$stack_dir:$models_dir python -m docs.openapi_generator.generate $(dirname $THIS_DIR)/resources PYTHONPATH=$PYTHONPATH:$stack_dir:$models_dir \
python -m docs.openapi_generator.generate $(dirname $THIS_DIR)/_static

View file

@ -109,10 +109,10 @@ def get_class_property_docstrings(
def docstring_to_schema(data_type: type) -> Schema: def docstring_to_schema(data_type: type) -> Schema:
short_description, long_description = get_class_docstrings(data_type) short_description, long_description = get_class_docstrings(data_type)
schema: Schema = {} schema: Schema = {}
if short_description:
schema["title"] = short_description description = "\n".join(filter(None, [short_description, long_description]))
if long_description: if description:
schema["description"] = long_description schema["description"] = description
return schema return schema
@ -248,7 +248,9 @@ class JsonSchemaGenerator:
type_schema.update(self._metadata_to_schema(m)) type_schema.update(self._metadata_to_schema(m))
return type_schema return type_schema
def _simple_type_to_schema(self, typ: TypeLike) -> Optional[Schema]: def _simple_type_to_schema(
self, typ: TypeLike, json_schema_extra: Optional[dict] = None
) -> Optional[Schema]:
""" """
Returns the JSON schema associated with a simple, unrestricted type. Returns the JSON schema associated with a simple, unrestricted type.
@ -264,6 +266,11 @@ class JsonSchemaGenerator:
elif typ is float: elif typ is float:
return {"type": "number"} return {"type": "number"}
elif typ is str: elif typ is str:
if json_schema_extra and "contentEncoding" in json_schema_extra:
return {
"type": "string",
"contentEncoding": json_schema_extra["contentEncoding"],
}
return {"type": "string"} return {"type": "string"}
elif typ is bytes: elif typ is bytes:
return {"type": "string", "contentEncoding": "base64"} return {"type": "string", "contentEncoding": "base64"}
@ -303,7 +310,12 @@ class JsonSchemaGenerator:
# not a simple type # not a simple type
return None return None
def type_to_schema(self, data_type: TypeLike, force_expand: bool = False) -> Schema: def type_to_schema(
self,
data_type: TypeLike,
force_expand: bool = False,
json_schema_extra: Optional[dict] = None,
) -> Schema:
""" """
Returns the JSON schema associated with a type. Returns the JSON schema associated with a type.
@ -313,7 +325,7 @@ class JsonSchemaGenerator:
""" """
# short-circuit for common simple types # short-circuit for common simple types
schema = self._simple_type_to_schema(data_type) schema = self._simple_type_to_schema(data_type, json_schema_extra)
if schema is not None: if schema is not None:
return schema return schema
@ -486,15 +498,9 @@ class JsonSchemaGenerator:
property_docstrings = get_class_property_docstrings( property_docstrings = get_class_property_docstrings(
typ, self.options.property_description_fun typ, self.options.property_description_fun
) )
properties: Dict[str, Schema] = {} properties: Dict[str, Schema] = {}
required: List[str] = [] required: List[str] = []
for property_name, property_type in get_class_properties(typ): for property_name, property_type in get_class_properties(typ):
defaults = {}
if "model_fields" in members:
f = members["model_fields"]
defaults = {k: finfo.default for k, finfo in f.items()}
# rename property if an alias name is specified # rename property if an alias name is specified
alias = get_annotation(property_type, Alias) alias = get_annotation(property_type, Alias)
if alias: if alias:
@ -502,11 +508,22 @@ class JsonSchemaGenerator:
else: else:
output_name = property_name output_name = property_name
defaults = {}
json_schema_extra = None
if "model_fields" in members:
f = members["model_fields"]
defaults = {k: finfo.default for k, finfo in f.items()}
json_schema_extra = f.get(output_name, None).json_schema_extra
if is_type_optional(property_type): if is_type_optional(property_type):
optional_type: type = unwrap_optional_type(property_type) optional_type: type = unwrap_optional_type(property_type)
property_def = self.type_to_schema(optional_type) property_def = self.type_to_schema(
optional_type, json_schema_extra=json_schema_extra
)
else: else:
property_def = self.type_to_schema(property_type) property_def = self.type_to_schema(
property_type, json_schema_extra=json_schema_extra
)
required.append(output_name) required.append(output_name)
# check if attribute has a default value initializer # check if attribute has a default value initializer
@ -531,6 +548,7 @@ class JsonSchemaGenerator:
# add property docstring if available # add property docstring if available
property_doc = property_docstrings.get(property_name) property_doc = property_docstrings.get(property_name)
if property_doc: if property_doc:
# print(output_name, property_doc)
property_def.pop("title", None) property_def.pop("title", None)
property_def["description"] = property_doc property_def["description"] = property_doc

View file

@ -6,6 +6,6 @@ Here's a collection of comprehensive guides, examples, and resources for buildin
Try out Llama Stack's capabilities through our detailed Jupyter notebooks: Try out Llama Stack's capabilities through our detailed Jupyter notebooks:
* [Building AI Applications Notebook](./notebooks/Llama_Stack_Building_AI_Applications.ipynb) - A comprehensive guide to building production-ready AI applications using Llama Stack * [Building AI Applications Notebook](./getting_started.ipynb) - A comprehensive guide to building production-ready AI applications using Llama Stack
* [Benchmark Evaluations Notebook](./notebooks/Llama_Stack_Benchmark_Evals.ipynb) - Detailed performance evaluations and benchmarking results * [Benchmark Evaluations Notebook](./notebooks/Llama_Stack_Benchmark_Evals.ipynb) - Detailed performance evaluations and benchmarking results
* [Zero-to-Hero Guide](./notebooks/Llama_Stack_Zero_to_Hero_Guide.ipynb) - Step-by-step guide for getting started with Llama Stack * [Zero-to-Hero Guide](./zero_to_hero_guide) - Step-by-step guide for getting started with Llama Stack

View file

@ -77,7 +77,7 @@ agent_config = AgentConfig(
instructions="You are a helpful assistant", instructions="You are a helpful assistant",
# Enable both RAG and tool usage # Enable both RAG and tool usage
toolgroups=[ toolgroups=[
{"name": "builtin::rag", "args": {"vector_db_ids": ["my_docs"]}}. {"name": "builtin::rag", "args": {"vector_db_ids": ["my_docs"]}},
"builtin::code_interpreter", "builtin::code_interpreter",
], ],
# Configure safety # Configure safety
@ -86,13 +86,9 @@ agent_config = AgentConfig(
# Control the inference loop # Control the inference loop
max_infer_iters=5, max_infer_iters=5,
sampling_params={ sampling_params={
"strategy": { "strategy": {"type": "top_p", "temperature": 0.7, "top_p": 0.95},
"type": "top_p", "max_tokens": 2048,
"temperature": 0.7,
"top_p": 0.95
}, },
"max_tokens": 2048
}
) )
agent = Agent(client, agent_config) agent = Agent(client, agent_config)
@ -101,11 +97,13 @@ session_id = agent.create_session("monitored_session")
# Stream the agent's execution steps # Stream the agent's execution steps
response = agent.create_turn( response = agent.create_turn(
messages=[{"role": "user", "content": "Analyze this code and run it"}], messages=[{"role": "user", "content": "Analyze this code and run it"}],
attachments=[{ attachments=[
{
"content": "https://raw.githubusercontent.com/example/code.py", "content": "https://raw.githubusercontent.com/example/code.py",
"mime_type": "text/plain" "mime_type": "text/plain",
}], }
session_id=session_id ],
session_id=session_id,
) )
# Monitor each step of execution # Monitor each step of execution

View file

@ -15,6 +15,7 @@ This first example walks you through how to evaluate a model candidate served by
```python ```python
import datasets import datasets
ds = datasets.load_dataset(path="llamastack/mmmu", name="Agriculture", split="dev") ds = datasets.load_dataset(path="llamastack/mmmu", name="Agriculture", split="dev")
ds = ds.select_columns(["chat_completion_input", "input_query", "expected_answer"]) ds = ds.select_columns(["chat_completion_input", "input_query", "expected_answer"])
eval_rows = ds.to_pandas().to_dict(orient="records") eval_rows = ds.to_pandas().to_dict(orient="records")
@ -43,7 +44,7 @@ system_message = {
client.eval_tasks.register( client.eval_tasks.register(
eval_task_id="meta-reference::mmmu", eval_task_id="meta-reference::mmmu",
dataset_id=f"mmmu-{subset}-{split}", dataset_id=f"mmmu-{subset}-{split}",
scoring_functions=["basic::regex_parser_multiple_choice_answer"] scoring_functions=["basic::regex_parser_multiple_choice_answer"],
) )
response = client.eval.evaluate_rows( response = client.eval.evaluate_rows(
@ -62,9 +63,9 @@ response = client.eval.evaluate_rows(
"max_tokens": 4096, "max_tokens": 4096,
"repeat_penalty": 1.0, "repeat_penalty": 1.0,
}, },
"system_message": system_message "system_message": system_message,
} },
} },
) )
``` ```
@ -88,7 +89,7 @@ _ = client.datasets.register(
"input_query": {"type": "string"}, "input_query": {"type": "string"},
"expected_answer": {"type": "string"}, "expected_answer": {"type": "string"},
"chat_completion_input": {"type": "chat_completion_input"}, "chat_completion_input": {"type": "chat_completion_input"},
} },
) )
eval_rows = client.datasetio.get_rows_paginated( eval_rows = client.datasetio.get_rows_paginated(
@ -101,7 +102,7 @@ eval_rows = client.datasetio.get_rows_paginated(
client.eval_tasks.register( client.eval_tasks.register(
eval_task_id="meta-reference::simpleqa", eval_task_id="meta-reference::simpleqa",
dataset_id=simpleqa_dataset_id, dataset_id=simpleqa_dataset_id,
scoring_functions=["llm-as-judge::405b-simpleqa"] scoring_functions=["llm-as-judge::405b-simpleqa"],
) )
response = client.eval.evaluate_rows( response = client.eval.evaluate_rows(
@ -120,8 +121,8 @@ response = client.eval.evaluate_rows(
"max_tokens": 4096, "max_tokens": 4096,
"repeat_penalty": 1.0, "repeat_penalty": 1.0,
}, },
} },
} },
) )
``` ```
@ -144,14 +145,14 @@ agent_config = {
{ {
"type": "brave_search", "type": "brave_search",
"engine": "tavily", "engine": "tavily",
"api_key": userdata.get("TAVILY_SEARCH_API_KEY") "api_key": userdata.get("TAVILY_SEARCH_API_KEY"),
} }
], ],
"tool_choice": "auto", "tool_choice": "auto",
"tool_prompt_format": "json", "tool_prompt_format": "json",
"input_shields": [], "input_shields": [],
"output_shields": [], "output_shields": [],
"enable_session_persistence": False "enable_session_persistence": False,
} }
response = client.eval.evaluate_rows( response = client.eval.evaluate_rows(
@ -163,7 +164,7 @@ response = client.eval.evaluate_rows(
"eval_candidate": { "eval_candidate": {
"type": "agent", "type": "agent",
"config": agent_config, "config": agent_config,
} },
} },
) )
``` ```

View file

@ -13,7 +13,7 @@ Here's how to set up basic evaluation:
response = client.eval_tasks.register( response = client.eval_tasks.register(
eval_task_id="my_eval", eval_task_id="my_eval",
dataset_id="my_dataset", dataset_id="my_dataset",
scoring_functions=["accuracy", "relevance"] scoring_functions=["accuracy", "relevance"],
) )
# Run evaluation # Run evaluation
@ -21,16 +21,10 @@ job = client.eval.run_eval(
task_id="my_eval", task_id="my_eval",
task_config={ task_config={
"type": "app", "type": "app",
"eval_candidate": { "eval_candidate": {"type": "agent", "config": agent_config},
"type": "agent", },
"config": agent_config
}
}
) )
# Get results # Get results
result = client.eval.job_result( result = client.eval.job_result(task_id="my_eval", job_id=job.job_id)
task_id="my_eval",
job_id=job.job_id
)
``` ```

View file

@ -4,7 +4,7 @@ Llama Stack provides all the building blocks needed to create sophisticated AI a
The best way to get started is to look at this notebook which walks through the various APIs (from basic inference, to RAG agents) and how to use them. The best way to get started is to look at this notebook which walks through the various APIs (from basic inference, to RAG agents) and how to use them.
**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb) **Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb)
Here are some key topics that will help you build effective agents: Here are some key topics that will help you build effective agents:

View file

@ -34,15 +34,15 @@ chunks = [
{ {
"document_id": "doc1", "document_id": "doc1",
"content": "Your document text here", "content": "Your document text here",
"mime_type": "text/plain" "mime_type": "text/plain",
}, },
...
] ]
client.vector_io.insert(vector_db_id, chunks) client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
# You can then query for these chunks # You can then query for these chunks
chunks_response = client.vector_io.query(vector_db_id, query="What do you know about...") chunks_response = client.vector_io.query(
vector_db_id=vector_db_id, query="What do you know about..."
)
``` ```
### Using the RAG Tool ### Using the RAG Tool
@ -71,8 +71,8 @@ client.tool_runtime.rag_tool.insert(
# Query documents # Query documents
results = client.tool_runtime.rag_tool.query( results = client.tool_runtime.rag_tool.query(
vector_db_id=vector_db_id, vector_db_ids=[vector_db_id],
query="What do you know about...", content="What do you know about...",
) )
``` ```
@ -81,19 +81,22 @@ results = client.tool_runtime.rag_tool.query(
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example: One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
```python ```python
from llama_stack_client.types.agent_create_params import AgentConfig
from llama_stack_client.lib.agents.agent import Agent
# Configure agent with memory # Configure agent with memory
agent_config = AgentConfig( agent_config = AgentConfig(
model="Llama3.2-3B-Instruct", model="meta-llama/Llama-3.2-3B-Instruct",
instructions="You are a helpful assistant", instructions="You are a helpful assistant",
enable_session_persistence=False,
toolgroups=[ toolgroups=[
{ {
"name": "builtin::rag", "name": "builtin::rag",
"args": { "args": {
"vector_db_ids": [vector_db_id], "vector_db_ids": [vector_db_id],
},
} }
} ],
]
) )
agent = Agent(client, agent_config) agent = Agent(client, agent_config)
@ -101,25 +104,21 @@ session_id = agent.create_session("rag_session")
# Initial document ingestion # Initial document ingestion
response = agent.create_turn( response = agent.create_turn(
messages=[{ messages=[
"role": "user", {"role": "user", "content": "I am providing some documents for reference."}
"content": "I am providing some documents for reference."
}],
documents=[
dict(
content="https://raw.githubusercontent.com/example/doc.rst",
mime_type="text/plain"
)
], ],
session_id=session_id documents=[
{
"content": "https://raw.githubusercontent.com/example/doc.rst",
"mime_type": "text/plain",
}
],
session_id=session_id,
) )
# Query with RAG # Query with RAG
response = agent.create_turn( response = agent.create_turn(
messages=[{ messages=[{"role": "user", "content": "What are the key topics in the documents?"}],
"role": "user", session_id=session_id,
"content": "What are the key topics in the documents?"
}],
session_id=session_id
) )
``` ```

View file

@ -5,15 +5,11 @@ Safety is a critical component of any AI application. Llama Stack provides a Shi
```python ```python
# Register a safety shield # Register a safety shield
shield_id = "content_safety" shield_id = "content_safety"
client.shields.register( client.shields.register(shield_id=shield_id, provider_shield_id="llama-guard-basic")
shield_id=shield_id,
provider_shield_id="llama-guard-basic"
)
# Run content through shield # Run content through shield
response = client.safety.run_shield( response = client.safety.run_shield(
shield_id=shield_id, shield_id=shield_id, messages=[{"role": "user", "content": "User message here"}]
messages=[{"role": "user", "content": "User message here"}]
) )
if response.violation: if response.violation:

View file

@ -8,24 +8,16 @@ The telemetry system supports three main types of events:
- **Unstructured Log Events**: Free-form log messages with severity levels - **Unstructured Log Events**: Free-form log messages with severity levels
```python ```python
unstructured_log_event = UnstructuredLogEvent( unstructured_log_event = UnstructuredLogEvent(
message="This is a log message", message="This is a log message", severity=LogSeverity.INFO
severity=LogSeverity.INFO
) )
``` ```
- **Metric Events**: Numerical measurements with units - **Metric Events**: Numerical measurements with units
```python ```python
metric_event = MetricEvent( metric_event = MetricEvent(metric="my_metric", value=10, unit="count")
metric="my_metric",
value=10,
unit="count"
)
``` ```
- **Structured Log Events**: System events like span start/end. Extensible to add more structured log types. - **Structured Log Events**: System events like span start/end. Extensible to add more structured log types.
```python ```python
structured_log_event = SpanStartPayload( structured_log_event = SpanStartPayload(name="my_span", parent_span_id="parent_span_id")
name="my_span",
parent_span_id="parent_span_id"
)
``` ```
### Spans and Traces ### Spans and Traces

View file

@ -35,7 +35,7 @@ Example client SDK call to register a "websearch" toolgroup that is provided by
client.toolgroups.register( client.toolgroups.register(
toolgroup_id="builtin::websearch", toolgroup_id="builtin::websearch",
provider_id="brave-search", provider_id="brave-search",
args={"max_results": 5} args={"max_results": 5},
) )
``` ```
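Once registered, the group's search tool can also be invoked directly through the tool runtime. A minimal sketch (it mirrors the `invoke_tool` example shown later on this page; the `web_search` tool name and the local server URL are assumptions):

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed local Llama Stack server

# Invoke the search tool exposed by the registered toolgroup
result = client.tool_runtime.invoke_tool(
    tool_name="web_search",
    kwargs={"query": "What is the capital of France?"},
)
print(result)
```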
@ -50,8 +50,7 @@ The Code Interpreter allows execution of Python code within a controlled environ
```python ```python
# Register Code Interpreter tool group # Register Code Interpreter tool group
client.toolgroups.register( client.toolgroups.register(
toolgroup_id="builtin::code_interpreter", toolgroup_id="builtin::code_interpreter", provider_id="code_interpreter"
provider_id="code_interpreter"
) )
``` ```
@ -68,16 +67,14 @@ The WolframAlpha tool provides access to computational knowledge through the Wol
```python ```python
# Register WolframAlpha tool group # Register WolframAlpha tool group
client.toolgroups.register( client.toolgroups.register(
toolgroup_id="builtin::wolfram_alpha", toolgroup_id="builtin::wolfram_alpha", provider_id="wolfram-alpha"
provider_id="wolfram-alpha"
) )
``` ```
Example usage: Example usage:
```python ```python
result = client.tool_runtime.invoke_tool( result = client.tool_runtime.invoke_tool(
tool_name="wolfram_alpha", tool_name="wolfram_alpha", args={"query": "solve x^2 + 2x + 1 = 0"}
args={"query": "solve x^2 + 2x + 1 = 0"}
) )
``` ```
@ -90,10 +87,7 @@ The Memory tool enables retrieval of context from various types of memory banks
client.toolgroups.register( client.toolgroups.register(
toolgroup_id="builtin::memory", toolgroup_id="builtin::memory",
provider_id="memory", provider_id="memory",
args={ args={"max_chunks": 5, "max_tokens_in_context": 4096},
"max_chunks": 5,
"max_tokens_in_context": 4096
}
) )
``` ```
@ -136,9 +130,7 @@ config = AgentConfig(
toolgroups=[ toolgroups=[
"builtin::websearch", "builtin::websearch",
], ],
client_tools=[ client_tools=[ToolDef(name="client_tool", description="Client provided tool")],
ToolDef(name="client_tool", description="Client provided tool")
]
) )
``` ```
@ -167,9 +159,9 @@ Example tool definition:
"name": "query", "name": "query",
"parameter_type": "string", "parameter_type": "string",
"description": "The query to search for", "description": "The query to search for",
"required": True "required": True,
} }
] ],
} }
``` ```
@ -179,8 +171,7 @@ Tools can be invoked using the `invoke_tool` method:
```python ```python
result = client.tool_runtime.invoke_tool( result = client.tool_runtime.invoke_tool(
tool_name="web_search", tool_name="web_search", kwargs={"query": "What is the capital of France?"}
kwargs={"query": "What is the capital of France?"}
) )
``` ```

View file

@ -62,10 +62,3 @@ While there is a lot of flexibility to mix-and-match providers, often users will
**On-device Distro**: Finally, you may want to run Llama Stack directly on an edge device (mobile phone or a tablet.) We provide Distros for iOS and Android (coming soon.) **On-device Distro**: Finally, you may want to run Llama Stack directly on an edge device (mobile phone or a tablet.) We provide Distros for iOS and Android (coming soon.)
```{toctree}
:maxdepth: 1
:hidden:
distributions/index
```

View file

@ -68,6 +68,7 @@ myst_substitutions = {
"docker_hub": "https://hub.docker.com/repository/docker/llamastack", "docker_hub": "https://hub.docker.com/repository/docker/llamastack",
} }
suppress_warnings = ['myst.header']
# Copy button settings # Copy button settings
copybutton_prompt_text = "$ " # for bash prompts copybutton_prompt_text = "$ " # for bash prompts
@ -94,22 +95,6 @@ html_static_path = ["../_static"]
# html_logo = "../_static/llama-stack-logo.png" # html_logo = "../_static/llama-stack-logo.png"
html_style = "../_static/css/my_theme.css" html_style = "../_static/css/my_theme.css"
redoc = [
{
"name": "Llama Stack API",
"page": "references/api_reference/index",
"spec": "../resources/llama-stack-spec.yaml",
"opts": {
"suppress-warnings": True,
# "expand-responses": ["200", "201"],
},
"embed": True,
},
]
redoc_uri = "https://cdn.redoc.ly/redoc/latest/bundles/redoc.standalone.js"
def setup(app): def setup(app):
def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]): def dockerhub_role(name, rawtext, text, lineno, inliner, options={}, content=[]):
url = f"https://hub.docker.com/r/llamastack/{text}" url = f"https://hub.docker.com/r/llamastack/{text}"

View file

@ -3,7 +3,7 @@
This guide will walk you through the process of adding a new API provider to Llama Stack. This guide will walk you through the process of adding a new API provider to Llama Stack.
- Begin by reviewing the [core concepts](../concepts/) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.) - Begin by reviewing the [core concepts](../concepts/index.md) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
- Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally. - Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
- Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary. - Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
- Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`llama_stack/scripts/distro_codegen.py` if necessary. - Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`llama_stack/scripts/distro_codegen.py` if necessary.

View file

@ -180,12 +180,45 @@ After this step is successful, you should be able to find the built container im
### Running your Stack server ### Running your Stack server
Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack build` step. Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack build` step.
```
llama stack run -h
usage: llama stack run [-h] [--port PORT] [--image-name IMAGE_NAME] [--disable-ipv6] [--env KEY=VALUE] [--tls-keyfile TLS_KEYFILE]
[--tls-certfile TLS_CERTFILE] [--image-type {conda,container,venv}]
config
start the server for a Llama Stack Distribution. You should have already built (or downloaded) and configured the distribution.
positional arguments:
config Path to config file to use for the run
options:
-h, --help show this help message and exit
--port PORT Port to run the server on. Defaults to 8321
--image-name IMAGE_NAME
Name of the image to run. Defaults to the current conda environment
--disable-ipv6 Disable IPv6 support
--env KEY=VALUE Environment variables to pass to the server in KEY=VALUE format. Can be specified multiple times.
--tls-keyfile TLS_KEYFILE
Path to TLS key file for HTTPS
--tls-certfile TLS_CERTFILE
Path to TLS certificate file for HTTPS
--image-type {conda,container,venv}
Image Type used during the build. This can be either conda or container or venv.
```
``` ```
# Start using template name # Start using template name
llama stack run tgi llama stack run tgi
# Start using config file # Start using config file
llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
# Start using a venv
llama stack run --image-type venv ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
# Start using a conda environment
llama stack run --image-type conda ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
``` ```
``` ```

View file

@ -1,9 +1,9 @@
# Using Llama Stack as a Library # Using Llama Stack as a Library
If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library. This avoids the overhead of setting up a server. If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library. This avoids the overhead of setting up a server.
```python ```bash
# setup # setup
pip install llama-stack uv pip install llama-stack
llama stack build --template together --image-type venv llama stack build --template together --image-type venv
``` ```
@ -13,7 +13,7 @@ from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient( client = LlamaStackAsLibraryClient(
"ollama", "ollama",
# provider_data is optional, but if you need to pass in any provider specific data, you can do so here. # provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
provider_data = {"tavily_search_api_key": os.environ['TAVILY_SEARCH_API_KEY']} provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
) )
await client.initialize() await client.initialize()
``` ```

View file

@ -7,14 +7,19 @@ You can run a Llama Stack server in one of the following ways:
This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and relying on an external inference service (eg. fireworks, together, groq, etc.) See [Using Llama Stack as a Library](importing_as_library) This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and relying on an external inference service (eg. fireworks, together, groq, etc.) See [Using Llama Stack as a Library](importing_as_library)
**Docker**: **Container**:
Another simple way to start interacting with Llama Stack is to just spin up docker which is pre-built with all the providers you need. We provide a number of pre-built Docker containers so you can start a Llama Stack server instantly. You can also build your own custom Docker container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](distributions/selection) for more details. Another simple way to start interacting with Llama Stack is to just spin up a container (via Docker or Podman) which is pre-built with all the providers you need. We provide a number of pre-built images so you can start a Llama Stack server instantly. You can also build your own custom container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](selection) for more details.
**Conda**: **Conda**:
Lastly, if you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details. If you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
**Kubernetes**:
If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
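As a quick taste of the library mode described above, here is a minimal sketch (it assumes the `ollama` template has been built with `llama stack build --template ollama --image-type venv` and mirrors the snippet in [Using Llama Stack as a Library](importing_as_library); the trailing `models.list()` call is just illustrative):

```python
import os

from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

# Library mode: no separate server process is started.
client = LlamaStackAsLibraryClient(
    "ollama",
    # provider_data is optional; pass provider-specific secrets here if needed
    provider_data={"tavily_search_api_key": os.environ["TAVILY_SEARCH_API_KEY"]},
)
await client.initialize()  # run inside an async context, e.g. a notebook cell

# The client then exposes the same APIs as a remote Llama Stack server
models = client.models.list()
print(models)
```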
```{toctree} ```{toctree}
@ -24,4 +29,6 @@ Lastly, if you have a custom or an advanced setup or you are developing on Llama
importing_as_library importing_as_library
building_distro building_distro
configuration configuration
selection
kubernetes_deployment
``` ```

View file

@ -0,0 +1,207 @@
# Kubernetes Deployment Guide
Instead of starting the Llama Stack and vLLM servers locally, we can deploy them in a Kubernetes cluster. In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
First, create a local Kubernetes cluster via Kind:
```bash
kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test
```
Start vLLM server as a Kubernetes Pod and Service:
```bash
cat <<EOF |kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: vllm-models
spec:
accessModes:
- ReadWriteOnce
volumeMode: Filesystem
resources:
requests:
storage: 50Gi
---
apiVersion: v1
kind: Secret
metadata:
name: hf-token-secret
type: Opaque
data:
token: $(HF_TOKEN)
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: vllm-server
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: vllm
template:
metadata:
labels:
app.kubernetes.io/name: vllm
spec:
containers:
- name: llama-stack
image: $(VLLM_IMAGE)
command:
- bash
- -c
- |
MODEL="meta-llama/Llama-3.2-1B-Instruct"
MODEL_PATH=/app/model/$(basename $MODEL)
huggingface-cli login --token $HUGGING_FACE_HUB_TOKEN
huggingface-cli download $MODEL --local-dir $MODEL_PATH --cache-dir $MODEL_PATH
python3 -m vllm.entrypoints.openai.api_server --model $MODEL_PATH --served-model-name $MODEL --port 8000
ports:
- containerPort: 8000
volumeMounts:
- name: llama-storage
mountPath: /app/model
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: vllm-models
---
apiVersion: v1
kind: Service
metadata:
name: vllm-server
spec:
selector:
app.kubernetes.io/name: vllm
ports:
- protocol: TCP
port: 8000
targetPort: 8000
type: ClusterIP
EOF
```
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
```bash
$ kubectl logs -l app.kubernetes.io/name=vllm
...
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
```
Then we can modify the Llama Stack run configuration YAML with the following inference provider:
```yaml
providers:
inference:
- provider_id: vllm
provider_type: remote::vllm
config:
url: http://vllm-server.default.svc.cluster.local:8000/v1
max_tokens: 4096
api_token: fake
```
Once we have defined the run configuration for Llama Stack, we can build an image with that configuration and the server source code:
```bash
cat >/tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s <<EOF
FROM distribution-myenv:dev
RUN apt-get update && apt-get install -y git
RUN git clone https://github.com/meta-llama/llama-stack.git /app/llama-stack-source
ADD ./vllm-llama-stack-run-k8s.yaml /app/config.yaml
EOF
podman build -f /tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s -t llama-stack-run-k8s /tmp/test-vllm-llama-stack
```
We can then start the Llama Stack server by deploying a Kubernetes Pod and Service:
```bash
cat <<EOF |kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: llama-pvc
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: llama-stack-server
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: llama-stack
template:
metadata:
labels:
app.kubernetes.io/name: llama-stack
spec:
containers:
- name: llama-stack
image: localhost/llama-stack-run-k8s:latest
imagePullPolicy: IfNotPresent
command: ["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]
ports:
- containerPort: 5000
volumeMounts:
- name: llama-storage
mountPath: /root/.llama
volumes:
- name: llama-storage
persistentVolumeClaim:
claimName: llama-pvc
---
apiVersion: v1
kind: Service
metadata:
name: llama-stack-service
spec:
selector:
app.kubernetes.io/name: llama-stack
ports:
- protocol: TCP
port: 5000
targetPort: 5000
type: ClusterIP
EOF
```
We can check that the Llama Stack server has started:
```bash
$ kubectl logs -l app.kubernetes.io/name=llama-stack
...
INFO: Started server process [1]
INFO: Waiting for application startup.
INFO: ASGI 'lifespan' protocol appears unsupported.
INFO: Application startup complete.
INFO: Uvicorn running on http://['::', '0.0.0.0']:5000 (Press CTRL+C to quit)
```
Finally, we forward the Kubernetes service to a local port and test some inference requests against it via the Llama Stack Client:
```bash
kubectl port-forward service/llama-stack-service 5000:5000
llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
```
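The same port-forwarded endpoint can also be exercised from Python with the client SDK. A minimal sketch (the model identifier below is the one deployed by the vLLM manifest above; adjust it to whatever model id is registered in your run configuration):

```python
from llama_stack_client import LlamaStackClient

# Point the client at the port-forwarded Llama Stack service
client = LlamaStackClient(base_url="http://localhost:5000")

response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-1B-Instruct",  # served by the vLLM deployment above
    messages=[{"role": "user", "content": "hello, what model are you?"}],
    stream=False,
)
print(response.completion_message.content)
```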

View file

@ -1,3 +1,4 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# NVIDIA Distribution # NVIDIA Distribution
The `llamastack/distribution-nvidia` distribution consists of the following provider configurations. The `llamastack/distribution-nvidia` distribution consists of the following provider configurations.

View file

@ -23,7 +23,7 @@ Which templates / distributions to choose depends on the hardware you have for r
- {dockerhub}`distribution-together` ([Guide](self_hosted_distro/together)) - {dockerhub}`distribution-together` ([Guide](self_hosted_distro/together))
- {dockerhub}`distribution-fireworks` ([Guide](self_hosted_distro/fireworks)) - {dockerhub}`distribution-fireworks` ([Guide](self_hosted_distro/fireworks))
- **Do you want to run Llama Stack inference on your iOS / Android device** Lastly, we also provide templates for running Llama Stack inference on your iOS / Android device: - **Do you want to run Llama Stack inference on your iOS / Android device?** Lastly, we also provide templates for running Llama Stack inference on your iOS / Android device:
- [iOS SDK](ondevice_distro/ios_sdk) - [iOS SDK](ondevice_distro/ios_sdk)
- [Android](ondevice_distro/android_sdk) - [Android](ondevice_distro/android_sdk)
@ -43,7 +43,6 @@ self_hosted_distro/nvidia
self_hosted_distro/ollama self_hosted_distro/ollama
self_hosted_distro/together self_hosted_distro/together
self_hosted_distro/fireworks self_hosted_distro/fireworks
ondevice_distro/index
``` ```
### On-Device Distributions ### On-Device Distributions

View file

@ -1,3 +1,4 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Bedrock Distribution # Bedrock Distribution
```{toctree} ```{toctree}

View file

@ -1,3 +1,4 @@
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Cerebras Distribution # Cerebras Distribution
The `llamastack/distribution-cerebras` distribution consists of the following provider configurations. The `llamastack/distribution-cerebras` distribution consists of the following provider configurations.

View file

@ -0,0 +1,186 @@
---
orphan: true
---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Dell Distribution of Llama Stack
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-dell` distribution consists of the following provider configurations.
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
| inference | `remote::tgi` |
| safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
You can use this distribution if you have GPUs and want to run an independent TGI or Dell Enterprise Hub container for running inference.
### Environment Variables
The following environment variables can be configured:
- `DEH_URL`: URL for the Dell inference server (default: `http://0.0.0.0:8181`)
- `DEH_SAFETY_URL`: URL for the Dell safety inference server (default: `http://0.0.0.0:8282`)
- `CHROMA_URL`: URL for the Chroma server (default: `http://localhost:6601`)
- `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`)
- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
## Setting up the inference server using Dell Enterprise Hub's custom TGI container
NOTE: This is a placeholder to run inference with TGI. This will be updated to use [Dell Enterprise Hub's containers](https://dell.huggingface.co/authenticated/models) once verified.
```bash
export INFERENCE_PORT=8181
export DEH_URL=http://0.0.0.0:$INFERENCE_PORT
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
export CHROMADB_HOST=localhost
export CHROMADB_PORT=6601
export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT
export CUDA_VISIBLE_DEVICES=0
export LLAMA_STACK_PORT=8321
docker run --rm -it \
--network host \
-v $HOME/.cache/huggingface:/data \
-e HF_TOKEN=$HF_TOKEN \
-p $INFERENCE_PORT:$INFERENCE_PORT \
--gpus $CUDA_VISIBLE_DEVICES \
ghcr.io/huggingface/text-generation-inference \
--dtype bfloat16 \
--usage-stats off \
--sharded false \
--cuda-memory-fraction 0.7 \
--model-id $INFERENCE_MODEL \
--port $INFERENCE_PORT --hostname 0.0.0.0
```
If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
```bash
export SAFETY_INFERENCE_PORT=8282
export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export CUDA_VISIBLE_DEVICES=1
docker run --rm -it \
--network host \
-v $HOME/.cache/huggingface:/data \
-e HF_TOKEN=$HF_TOKEN \
-p $SAFETY_INFERENCE_PORT:$SAFETY_INFERENCE_PORT \
--gpus $CUDA_VISIBLE_DEVICES \
ghcr.io/huggingface/text-generation-inference \
--dtype bfloat16 \
--usage-stats off \
--sharded false \
--cuda-memory-fraction 0.7 \
--model-id $SAFETY_MODEL \
--hostname 0.0.0.0 \
--port $SAFETY_INFERENCE_PORT
```
## Dell distribution relies on ChromaDB for vector database usage
You can easily start a ChromaDB container using Podman (or Docker).
```bash
# This is where the indices are persisted
mkdir -p $HOME/chromadb
podman run --rm -it \
--network host \
--name chromadb \
-v $HOME/chromadb:/chroma/chroma \
-e IS_PERSISTENT=TRUE \
chromadb/chroma:latest \
--port $CHROMADB_PORT \
--host $CHROMADB_HOST
```
## Running Llama Stack
Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build the code yourself) or Docker, which uses a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
# NOTE: mount the llama-stack / llama-models source directories only if testing local changes; otherwise these extra -v mounts are not needed
# Use localhost/distribution-dell:dev instead of llamastack/distribution-dell if building / testing locally
docker run -it \
  --network host \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v $HOME/.llama:/root/.llama \
  -v /home/hjshah/git/llama-stack:/app/llama-stack-source -v /home/hjshah/git/llama-models:/app/llama-models-source \
  llamastack/distribution-dell \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env CHROMA_URL=$CHROMA_URL
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
# You need a local checkout of llama-stack to run this, get it using
# git clone https://github.com/meta-llama/llama-stack.git
cd /path/to/llama-stack
export SAFETY_INFERENCE_PORT=8282
export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v $HOME/.llama:/root/.llama \
-v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
llamastack/distribution-dell \
--yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env DEH_SAFETY_URL=$DEH_SAFETY_URL \
--env CHROMA_URL=$CHROMA_URL
```
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template dell --image-type conda
llama stack run dell \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env CHROMA_URL=$CHROMA_URL
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run ./run-with-safety.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env DEH_URL=$DEH_URL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env DEH_SAFETY_URL=$DEH_SAFETY_URL \
--env CHROMA_URL=$CHROMA_URL
```
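Whichever way you started the server, a short Python sketch exercises the inference API end to end. It assumes `LLAMA_STACK_PORT` and `INFERENCE_MODEL` are still exported in your shell and that the `llama-stack-client` package is installed:
```python
import os

from llama_stack_client import LlamaStackClient

# Connect to the locally running Dell distribution
client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")

response = client.inference.chat_completion(
    model_id=os.environ["INFERENCE_MODEL"],
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a haiku about llamas."},
    ],
)
print(response.completion_message.content)
```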
View file
@ -1,6 +1,7 @@
--- ---
orphan: true orphan: true
--- ---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Fireworks Distribution # Fireworks Distribution
```{toctree} ```{toctree}
View file
@ -1,6 +1,7 @@
--- ---
orphan: true orphan: true
--- ---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Meta Reference Distribution # Meta Reference Distribution
```{toctree} ```{toctree}
@ -82,7 +83,7 @@ docker run \
### Via Conda ### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash ```bash
llama stack build --template meta-reference-gpu --image-type conda llama stack build --template meta-reference-gpu --image-type conda
View file
@ -1,6 +1,7 @@
--- ---
orphan: true orphan: true
--- ---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Meta Reference Quantized Distribution # Meta Reference Quantized Distribution
```{toctree} ```{toctree}
@ -82,7 +83,7 @@ docker run \
### Via Conda ### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash ```bash
llama stack build --template meta-reference-quantized-gpu --image-type conda llama stack build --template meta-reference-quantized-gpu --image-type conda
View file
@ -1,6 +1,7 @@
--- ---
orphan: true orphan: true
--- ---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Ollama Distribution # Ollama Distribution
```{toctree} ```{toctree}
@ -25,7 +26,9 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Environment Variables You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
### Environment Variables
The following environment variables can be configured: The following environment variables can be configured:
@ -101,7 +104,7 @@ docker run \
### Via Conda ### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash ```bash
export LLAMA_STACK_PORT=5001 export LLAMA_STACK_PORT=5001
View file
@ -1,6 +1,7 @@
--- ---
orphan: true orphan: true
--- ---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Remote vLLM Distribution # Remote vLLM Distribution
```{toctree} ```{toctree}
:maxdepth: 2 :maxdepth: 2
@ -131,7 +132,7 @@ docker run \
### Via Conda ### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash ```bash
export INFERENCE_PORT=8000 export INFERENCE_PORT=8000
View file
@ -1,6 +1,7 @@
--- ---
orphan: true orphan: true
--- ---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# SambaNova Distribution # SambaNova Distribution
```{toctree} ```{toctree}
@ -38,13 +39,15 @@ The following models are available by default:
- `meta-llama/Llama-3.1-405B-Instruct-FP8 (Meta-Llama-3.1-405B-Instruct)` - `meta-llama/Llama-3.1-405B-Instruct-FP8 (Meta-Llama-3.1-405B-Instruct)`
- `meta-llama/Llama-3.2-1B-Instruct (Meta-Llama-3.2-1B-Instruct)` - `meta-llama/Llama-3.2-1B-Instruct (Meta-Llama-3.2-1B-Instruct)`
- `meta-llama/Llama-3.2-3B-Instruct (Meta-Llama-3.2-3B-Instruct)` - `meta-llama/Llama-3.2-3B-Instruct (Meta-Llama-3.2-3B-Instruct)`
- `meta-llama/Llama-3.3-70B-Instruct (Meta-Llama-3.3-70B-Instruct)`
- `meta-llama/Llama-3.2-11B-Vision-Instruct (Llama-3.2-11B-Vision-Instruct)` - `meta-llama/Llama-3.2-11B-Vision-Instruct (Llama-3.2-11B-Vision-Instruct)`
- `meta-llama/Llama-3.2-90B-Vision-Instruct (Llama-3.2-90B-Vision-Instruct)` - `meta-llama/Llama-3.2-90B-Vision-Instruct (Llama-3.2-90B-Vision-Instruct)`
- `meta-llama/Llama-Guard-3-8B (Meta-Llama-Guard-3-8B)`
### Prerequisite: API Keys ### Prerequisite: API Keys
Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaBova.ai](https://sambanova.ai/). Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](https://sambanova.ai/).
## Running Llama Stack with SambaNova ## Running Llama Stack with SambaNova
View file
@ -1,6 +1,7 @@
--- ---
orphan: true orphan: true
--- ---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# TGI Distribution # TGI Distribution
@ -122,7 +123,7 @@ docker run \
### Via Conda ### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.
```bash ```bash
llama stack build --template tgi --image-type conda llama stack build --template tgi --image-type conda
View file
@ -1,6 +1,7 @@
--- ---
orphan: true orphan: true
--- ---
<!-- This file was auto-generated by distro_codegen.py, please edit source -->
# Together Distribution # Together Distribution
```{toctree} ```{toctree}
View file
@ -15,8 +15,11 @@ ollama run llama3.2:3b-instruct-fp16 --keepalive 60m
By default, Ollama keeps the model loaded in memory for 5 minutes which can be too short. We set the `--keepalive` flag to 60 minutes to ensure the model remains loaded for sometime. By default, Ollama keeps the model loaded in memory for 5 minutes which can be too short. We set the `--keepalive` flag to 60 minutes to ensure the model remains loaded for sometime.
NOTE: If you do not have ollama, you can install it from [here](https://ollama.ai/docs/installation). ```{admonition} Note
:class: tip
If you do not have ollama, you can install it from [here](https://ollama.com/download).
```
### 2. Pick a client environment ### 2. Pick a client environment
@ -35,15 +38,20 @@ The API is **exactly identical** for both clients.
:::{dropdown} Starting up the Llama Stack server :::{dropdown} Starting up the Llama Stack server
The Llama Stack server can be configured flexibly so you can mix-and-match various providers for its individual API components -- beyond Inference, these include Vector IO, Agents, Telemetry, Evals, Post Training, etc. The Llama Stack server can be configured flexibly so you can mix-and-match various providers for its individual API components -- beyond Inference, these include Vector IO, Agents, Telemetry, Evals, Post Training, etc.
To get started quickly, we provide various Docker images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the Docker image. To get started quickly, we provide various container images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the container image.
Lets setup some environment variables that we will use in the rest of the guide. Lets setup some environment variables that we will use in the rest of the guide.
```bash ```bash
INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
LLAMA_STACK_PORT=8321 export LLAMA_STACK_PORT=8321
``` ```
You can start the server using the following command: Next you can create a local directory to mount into the containers file system.
```bash
mkdir -p ~/.llama
```
Then you can start the server using the container tool of your choice. For example, if you are running Docker you can use the following command:
```bash ```bash
docker run -it \ docker run -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@ -53,8 +61,28 @@ docker run -it \
--env INFERENCE_MODEL=$INFERENCE_MODEL \ --env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434 --env OLLAMA_URL=http://host.docker.internal:11434
``` ```
As another example, to start the container with Podman, you can do the same but replace `docker` at the start of the command with `podman`. If you are using `podman` older than `4.7.0`, please also replace `host.docker.internal` in the `OLLAMA_URL` with `host.containers.internal`.
Configuration for this is available at `distributions/ollama/run.yaml`. Configuration for this is available at `distributions/ollama/run.yaml`.
```{admonition} Note
:class: note
Docker containers run in their own isolated network namespaces on Linux. To allow the container to communicate with services running on the host via `localhost`, you need `--network=host`. This makes the container use the hosts network directly so it can connect to Ollama running on `localhost:11434`.
Linux users having issues running the above command should instead try the following:
```bash
docker run -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
--network=host \
llamastack/distribution-ollama \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://localhost:11434
```
::: :::
@ -71,8 +99,10 @@ pip install llama-stack-client
Let's use the `llama-stack-client` CLI to check the connectivity to the server. Let's use the `llama-stack-client` CLI to check the connectivity to the server.
```bash ```bash
llama-stack-client configure --endpoint http://localhost:$LLAMA_STACK_PORT $ llama-stack-client configure --endpoint http://localhost:$LLAMA_STACK_PORT
llama-stack-client models list > Enter the API key (leave empty if no key is needed):
Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
$ llama-stack-client models list
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ provider_resource_id ┃ metadata ┃ ┃ identifier ┃ provider_id ┃ provider_resource_id ┃ metadata ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
@ -95,19 +125,30 @@ llama-stack-client \
Here is a simple example to perform chat completions using the SDK. Here is a simple example to perform chat completions using the SDK.
```python ```python
import os import os
import sys
def create_http_client(): def create_http_client():
from llama_stack_client import LlamaStackClient from llama_stack_client import LlamaStackClient
return LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
return LlamaStackClient(
base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}"
)
def create_library_client(template="ollama"): def create_library_client(template="ollama"):
from llama_stack import LlamaStackAsLibraryClient from llama_stack import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient(template) client = LlamaStackAsLibraryClient(template)
client.initialize() if not client.initialize():
print("llama stack not built properly")
sys.exit(1)
return client return client
client = create_library_client() # or create_http_client() depending on the environment you picked client = (
create_library_client()
) # or create_http_client() depending on the environment you picked
# List available models # List available models
models = client.models.list() models = client.models.list()
@ -120,8 +161,8 @@ response = client.inference.chat_completion(
model_id=os.environ["INFERENCE_MODEL"], model_id=os.environ["INFERENCE_MODEL"],
messages=[ messages=[
{"role": "system", "content": "You are a helpful assistant."}, {"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Write a haiku about coding"} {"role": "user", "content": "Write a haiku about coding"},
] ],
) )
print(response.completion_message.content) print(response.completion_message.content)
``` ```
@ -132,6 +173,7 @@ Here is an example of a simple RAG (Retrieval Augmented Generation) chatbot agen
```python ```python
import os import os
import uuid
from termcolor import cprint from termcolor import cprint
from llama_stack_client.lib.agents.agent import Agent from llama_stack_client.lib.agents.agent import Agent
@ -139,10 +181,29 @@ from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client.types.agent_create_params import AgentConfig from llama_stack_client.types.agent_create_params import AgentConfig
from llama_stack_client.types import Document from llama_stack_client.types import Document
client = create_library_client() # or create_http_client() depending on the environment you picked
def create_http_client():
from llama_stack_client import LlamaStackClient
return LlamaStackClient(
base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}"
)
def create_library_client(template="ollama"):
from llama_stack import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient(template)
client.initialize()
return client
client = (
create_library_client()
) # or create_http_client() depending on the environment you picked
# Documents to be used for RAG # Documents to be used for RAG
urls = ["chat.rst", "llama3.rst", "datasets.rst", "lora_finetune.rst"] urls = ["chat.rst", "llama3.rst", "memory_optimizations.rst", "lora_finetune.rst"]
documents = [ documents = [
Document( Document(
document_id=f"num-{i}", document_id=f"num-{i}",
@ -154,7 +215,7 @@ documents = [
] ]
# Register a vector database # Register a vector database
vector_db_id = "test-vector-db" vector_db_id = f"test-vector-db-{uuid.uuid4().hex}"
client.vector_dbs.register( client.vector_dbs.register(
vector_db_id=vector_db_id, vector_db_id=vector_db_id,
embedding_model="all-MiniLM-L6-v2", embedding_model="all-MiniLM-L6-v2",
@ -179,7 +240,7 @@ agent_config = AgentConfig(
"name": "builtin::rag", "name": "builtin::rag",
"args": { "args": {
"vector_db_ids": [vector_db_id], "vector_db_ids": [vector_db_id],
} },
} }
], ],
) )
@ -193,7 +254,7 @@ user_prompts = [
# Run the agent loop by calling the `create_turn` method # Run the agent loop by calling the `create_turn` method
for prompt in user_prompts: for prompt in user_prompts:
cprint(f'User> {prompt}', 'green') cprint(f"User> {prompt}", "green")
response = rag_agent.create_turn( response = rag_agent.create_turn(
messages=[{"role": "user", "content": prompt}], messages=[{"role": "user", "content": prompt}],
session_id=session_id, session_id=session_id,
View file
@ -1,7 +1,8 @@
```{admonition} News ```{admonition} News
:class: tip :class: tip
Llama Stack 0.1.0 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.0) for more details. Llama Stack 0.1.2 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.2) for more details.
``` ```
# Llama Stack # Llama Stack
View file
@ -1,7 +1,6 @@
{.hide-title}
# API Reference # API Reference
```{eval-rst} ```{raw} html
.. sphinxcontrib-redoc:: ../resources/llama-stack-spec.yaml :file: ../../../_static/llama-stack-spec.html
:page-title: API Reference
:expand-responses: all
``` ```
View file
@ -12,7 +12,7 @@ This guide goes over the sets of APIs and developer experience flow of using Lla
## Evaluation Concepts ## Evaluation Concepts
The Evaluation APIs are associated with a set of Resources as shown in the following diagram. Please visit the Resources section in our [Core Concepts](../concepts/index.md) guide for better high-level understanding. The Evaluation APIs are associated with a set of Resources as shown in the following diagram. Please visit the Resources section in our [Core Concepts](../../concepts/index.md) guide for better high-level understanding.
![Eval Concepts](./resources/eval-concept.png) ![Eval Concepts](./resources/eval-concept.png)
@ -51,6 +51,7 @@ This first example walks you through how to evaluate a model candidate served by
```python ```python
import datasets import datasets
ds = datasets.load_dataset(path="llamastack/mmmu", name="Agriculture", split="dev") ds = datasets.load_dataset(path="llamastack/mmmu", name="Agriculture", split="dev")
ds = ds.select_columns(["chat_completion_input", "input_query", "expected_answer"]) ds = ds.select_columns(["chat_completion_input", "input_query", "expected_answer"])
eval_rows = ds.to_pandas().to_dict(orient="records") eval_rows = ds.to_pandas().to_dict(orient="records")
@ -79,7 +80,7 @@ system_message = {
client.eval_tasks.register( client.eval_tasks.register(
eval_task_id="meta-reference::mmmu", eval_task_id="meta-reference::mmmu",
dataset_id=f"mmmu-{subset}-{split}", dataset_id=f"mmmu-{subset}-{split}",
scoring_functions=["basic::regex_parser_multiple_choice_answer"] scoring_functions=["basic::regex_parser_multiple_choice_answer"],
) )
response = client.eval.evaluate_rows( response = client.eval.evaluate_rows(
@ -98,9 +99,9 @@ response = client.eval.evaluate_rows(
"max_tokens": 4096, "max_tokens": 4096,
"repeat_penalty": 1.0, "repeat_penalty": 1.0,
}, },
"system_message": system_message "system_message": system_message,
} },
} },
) )
``` ```
@ -124,7 +125,7 @@ _ = client.datasets.register(
"input_query": {"type": "string"}, "input_query": {"type": "string"},
"expected_answer": {"type": "string"}, "expected_answer": {"type": "string"},
"chat_completion_input": {"type": "chat_completion_input"}, "chat_completion_input": {"type": "chat_completion_input"},
} },
) )
eval_rows = client.datasetio.get_rows_paginated( eval_rows = client.datasetio.get_rows_paginated(
@ -137,7 +138,7 @@ eval_rows = client.datasetio.get_rows_paginated(
client.eval_tasks.register( client.eval_tasks.register(
eval_task_id="meta-reference::simpleqa", eval_task_id="meta-reference::simpleqa",
dataset_id=simpleqa_dataset_id, dataset_id=simpleqa_dataset_id,
scoring_functions=["llm-as-judge::405b-simpleqa"] scoring_functions=["llm-as-judge::405b-simpleqa"],
) )
response = client.eval.evaluate_rows( response = client.eval.evaluate_rows(
@ -156,8 +157,8 @@ response = client.eval.evaluate_rows(
"max_tokens": 4096, "max_tokens": 4096,
"repeat_penalty": 1.0, "repeat_penalty": 1.0,
}, },
} },
} },
) )
``` ```
@ -180,14 +181,14 @@ agent_config = {
{ {
"type": "brave_search", "type": "brave_search",
"engine": "tavily", "engine": "tavily",
"api_key": userdata.get("TAVILY_SEARCH_API_KEY") "api_key": userdata.get("TAVILY_SEARCH_API_KEY"),
} }
], ],
"tool_choice": "auto", "tool_choice": "auto",
"tool_prompt_format": "json", "tool_prompt_format": "json",
"input_shields": [], "input_shields": [],
"output_shields": [], "output_shields": [],
"enable_session_persistence": False "enable_session_persistence": False,
} }
response = client.eval.evaluate_rows( response = client.eval.evaluate_rows(
@ -199,8 +200,8 @@ response = client.eval.evaluate_rows(
"eval_candidate": { "eval_candidate": {
"type": "agent", "type": "agent",
"config": agent_config, "config": agent_config,
} },
} },
) )
``` ```
@ -237,7 +238,9 @@ GENERATED_RESPONSE: {generated_answer}
EXPECTED_RESPONSE: {expected_answer} EXPECTED_RESPONSE: {expected_answer}
""" """
input_query = "What are the top 5 topics that were explained? Only list succinct bullet points." input_query = (
"What are the top 5 topics that were explained? Only list succinct bullet points."
)
generated_answer = """ generated_answer = """
Here are the top 5 topics that were explained in the documentation for Torchtune: Here are the top 5 topics that were explained in the documentation for Torchtune:
@ -268,7 +271,9 @@ scoring_params = {
"braintrust::factuality": None, "braintrust::factuality": None,
} }
response = client.scoring.score(input_rows=dataset_rows, scoring_functions=scoring_params) response = client.scoring.score(
input_rows=dataset_rows, scoring_functions=scoring_params
)
``` ```
## Running Evaluations via CLI ## Running Evaluations via CLI
View file
@ -33,7 +33,11 @@ from llama_stack_client.types import (
Types: Types:
```python ```python
from llama_stack_client.types import ListToolGroupsResponse, ToolGroup, ToolgroupListResponse from llama_stack_client.types import (
ListToolGroupsResponse,
ToolGroup,
ToolgroupListResponse,
)
``` ```
Methods: Methods:
@ -444,7 +448,11 @@ Methods:
Types: Types:
```python ```python
from llama_stack_client.types import EvalTask, ListEvalTasksResponse, EvalTaskListResponse from llama_stack_client.types import (
EvalTask,
ListEvalTasksResponse,
EvalTaskListResponse,
)
``` ```
Methods: Methods:
View file
@ -45,7 +45,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
--- ---
## Install Dependencies and Set Up Environmen ## Install Dependencies and Set Up Environment
1. **Create a Conda Environment**: 1. **Create a Conda Environment**:
Create a new Conda environment with Python 3.10: Create a new Conda environment with Python 3.10:
@ -73,7 +73,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
Open a new terminal and install `llama-stack`: Open a new terminal and install `llama-stack`:
```bash ```bash
conda activate ollama conda activate ollama
pip install llama-stack==0.0.61 pip install llama-stack==0.1.0
``` ```
--- ---
@ -191,7 +191,7 @@ You can check the available models with the command `llama-stack-client models l
You can also interact with the Llama Stack server using a simple Python script. Below is an example: You can also interact with the Llama Stack server using a simple Python script. Below is an example:
### 1. Activate Conda Environmen ### 1. Activate Conda Environment
```bash ```bash
conda activate ollama conda activate ollama
@ -208,7 +208,7 @@ In `test_llama_stack.py`, write the following code:
```python ```python
import os import os
from llama_stack_client import LlamaStackClien from llama_stack_client import LlamaStackClient
# Get the model ID from the environment variable # Get the model ID from the environment variable
INFERENCE_MODEL = os.environ.get("INFERENCE_MODEL") INFERENCE_MODEL = os.environ.get("INFERENCE_MODEL")
@ -224,7 +224,7 @@ client = LlamaStackClient(base_url="http://localhost:5001")
response = client.inference.chat_completion( response = client.inference.chat_completion(
messages=[ messages=[
{"role": "system", "content": "You are a friendly assistant."}, {"role": "system", "content": "You are a friendly assistant."},
{"role": "user", "content": "Write a two-sentence poem about llama."} {"role": "user", "content": "Write a two-sentence poem about llama."},
], ],
model_id=INFERENCE_MODEL, model_id=INFERENCE_MODEL,
) )
View file
@ -15,20 +15,21 @@ from typing import (
Literal, Literal,
Optional, Optional,
Protocol, Protocol,
runtime_checkable,
Union, Union,
runtime_checkable,
) )
from llama_models.schema_utils import json_schema_type, register_schema, webmethod from llama_models.schema_utils import json_schema_type, register_schema, webmethod
from pydantic import BaseModel, ConfigDict, Field from pydantic import BaseModel, ConfigDict, Field
from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent, URL from llama_stack.apis.common.content_types import URL, ContentDelta, InterleavedContent
from llama_stack.apis.inference import ( from llama_stack.apis.inference import (
CompletionMessage, CompletionMessage,
ResponseFormat, ResponseFormat,
SamplingParams, SamplingParams,
ToolCall, ToolCall,
ToolChoice, ToolChoice,
ToolConfig,
ToolPromptFormat, ToolPromptFormat,
ToolResponse, ToolResponse,
ToolResponseMessage, ToolResponseMessage,
@ -86,9 +87,7 @@ class ShieldCallStep(StepCommon):
@json_schema_type @json_schema_type
class MemoryRetrievalStep(StepCommon): class MemoryRetrievalStep(StepCommon):
step_type: Literal[StepType.memory_retrieval.value] = ( step_type: Literal[StepType.memory_retrieval.value] = StepType.memory_retrieval.value
StepType.memory_retrieval.value
)
vector_db_ids: str vector_db_ids: str
inserted_context: InterleavedContent inserted_context: InterleavedContent
@ -118,7 +117,7 @@ class Turn(BaseModel):
] ]
steps: List[Step] steps: List[Step]
output_message: CompletionMessage output_message: CompletionMessage
output_attachments: List[Attachment] = Field(default_factory=list) output_attachments: Optional[List[Attachment]] = Field(default_factory=list)
started_at: datetime started_at: datetime
completed_at: Optional[datetime] = None completed_at: Optional[datetime] = None
@ -155,10 +154,25 @@ class AgentConfigCommon(BaseModel):
output_shields: Optional[List[str]] = Field(default_factory=list) output_shields: Optional[List[str]] = Field(default_factory=list)
toolgroups: Optional[List[AgentToolGroup]] = Field(default_factory=list) toolgroups: Optional[List[AgentToolGroup]] = Field(default_factory=list)
client_tools: Optional[List[ToolDef]] = Field(default_factory=list) client_tools: Optional[List[ToolDef]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto) tool_choice: Optional[ToolChoice] = Field(default=None, deprecated="use tool_config instead")
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None) tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None, deprecated="use tool_config instead")
tool_config: Optional[ToolConfig] = Field(default=None)
max_infer_iters: int = 10 max_infer_iters: Optional[int] = 10
def model_post_init(self, __context):
if self.tool_config:
if self.tool_choice and self.tool_config.tool_choice != self.tool_choice:
raise ValueError("tool_choice is deprecated. Use tool_choice in tool_config instead.")
if self.tool_prompt_format and self.tool_config.tool_prompt_format != self.tool_prompt_format:
raise ValueError("tool_prompt_format is deprecated. Use tool_prompt_format in tool_config instead.")
else:
params = {}
if self.tool_choice:
params["tool_choice"] = self.tool_choice
if self.tool_prompt_format:
params["tool_prompt_format"] = self.tool_prompt_format
self.tool_config = ToolConfig(**params)
@json_schema_type @json_schema_type
@ -184,9 +198,7 @@ class AgentTurnResponseEventType(Enum):
@json_schema_type @json_schema_type
class AgentTurnResponseStepStartPayload(BaseModel): class AgentTurnResponseStepStartPayload(BaseModel):
event_type: Literal[AgentTurnResponseEventType.step_start.value] = ( event_type: Literal[AgentTurnResponseEventType.step_start.value] = AgentTurnResponseEventType.step_start.value
AgentTurnResponseEventType.step_start.value
)
step_type: StepType step_type: StepType
step_id: str step_id: str
metadata: Optional[Dict[str, Any]] = Field(default_factory=dict) metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
@ -194,9 +206,7 @@ class AgentTurnResponseStepStartPayload(BaseModel):
@json_schema_type @json_schema_type
class AgentTurnResponseStepCompletePayload(BaseModel): class AgentTurnResponseStepCompletePayload(BaseModel):
event_type: Literal[AgentTurnResponseEventType.step_complete.value] = ( event_type: Literal[AgentTurnResponseEventType.step_complete.value] = AgentTurnResponseEventType.step_complete.value
AgentTurnResponseEventType.step_complete.value
)
step_type: StepType step_type: StepType
step_id: str step_id: str
step_details: Step step_details: Step
@ -206,9 +216,7 @@ class AgentTurnResponseStepCompletePayload(BaseModel):
class AgentTurnResponseStepProgressPayload(BaseModel): class AgentTurnResponseStepProgressPayload(BaseModel):
model_config = ConfigDict(protected_namespaces=()) model_config = ConfigDict(protected_namespaces=())
event_type: Literal[AgentTurnResponseEventType.step_progress.value] = ( event_type: Literal[AgentTurnResponseEventType.step_progress.value] = AgentTurnResponseEventType.step_progress.value
AgentTurnResponseEventType.step_progress.value
)
step_type: StepType step_type: StepType
step_id: str step_id: str
@ -217,17 +225,13 @@ class AgentTurnResponseStepProgressPayload(BaseModel):
@json_schema_type @json_schema_type
class AgentTurnResponseTurnStartPayload(BaseModel): class AgentTurnResponseTurnStartPayload(BaseModel):
event_type: Literal[AgentTurnResponseEventType.turn_start.value] = ( event_type: Literal[AgentTurnResponseEventType.turn_start.value] = AgentTurnResponseEventType.turn_start.value
AgentTurnResponseEventType.turn_start.value
)
turn_id: str turn_id: str
@json_schema_type @json_schema_type
class AgentTurnResponseTurnCompletePayload(BaseModel): class AgentTurnResponseTurnCompletePayload(BaseModel):
event_type: Literal[AgentTurnResponseEventType.turn_complete.value] = ( event_type: Literal[AgentTurnResponseEventType.turn_complete.value] = AgentTurnResponseEventType.turn_complete.value
AgentTurnResponseEventType.turn_complete.value
)
turn: Turn turn: Turn
@ -280,6 +284,7 @@ class AgentTurnCreateRequest(AgentConfigOverridablePerTurn):
toolgroups: Optional[List[AgentToolGroup]] = None toolgroups: Optional[List[AgentToolGroup]] = None
stream: Optional[bool] = False stream: Optional[bool] = False
tool_config: Optional[ToolConfig] = None
@json_schema_type @json_schema_type
@ -297,6 +302,16 @@ class AgentStepResponse(BaseModel):
@runtime_checkable @runtime_checkable
@trace_protocol @trace_protocol
class Agents(Protocol): class Agents(Protocol):
"""Agents API for creating and interacting with agentic systems.
Main functionalities provided by this API:
- Create agents with specific instructions and ability to use tools.
- Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
- Agents can be provided with various shields (see the Safety API for more details).
- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
"""
@webmethod(route="/agents", method="POST") @webmethod(route="/agents", method="POST")
async def create_agent( async def create_agent(
self, self,
@ -317,10 +332,12 @@ class Agents(Protocol):
stream: Optional[bool] = False, stream: Optional[bool] = False,
documents: Optional[List[Document]] = None, documents: Optional[List[Document]] = None,
toolgroups: Optional[List[AgentToolGroup]] = None, toolgroups: Optional[List[AgentToolGroup]] = None,
tool_config: Optional[ToolConfig] = None,
) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]: ... ) -> Union[Turn, AsyncIterator[AgentTurnResponseStreamChunk]]: ...
@webmethod( @webmethod(
route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}", method="GET" route="/agents/{agent_id}/session/{session_id}/turn/{turn_id}",
method="GET",
) )
async def get_agents_turn( async def get_agents_turn(
self, self,
View file
@ -13,7 +13,6 @@ from termcolor import cprint
from llama_stack.apis.agents import AgentTurnResponseEventType, StepType from llama_stack.apis.agents import AgentTurnResponseEventType, StepType
from llama_stack.apis.common.content_types import ToolCallParseStatus from llama_stack.apis.common.content_types import ToolCallParseStatus
from llama_stack.apis.inference import ToolResponseMessage from llama_stack.apis.inference import ToolResponseMessage
from llama_stack.providers.utils.inference.prompt_adapter import ( from llama_stack.providers.utils.inference.prompt_adapter import (
interleaved_content_as_str, interleaved_content_as_str,
) )
@ -63,9 +62,7 @@ class EventLogger:
if isinstance(chunk, ToolResponseMessage): if isinstance(chunk, ToolResponseMessage):
yield ( yield (
chunk, chunk,
LogEvent( LogEvent(role="CustomTool", content=chunk.content, color="grey"),
role="CustomTool", content=chunk.content, color="grey"
),
) )
continue continue
@ -81,17 +78,12 @@ class EventLogger:
step_type = event.payload.step_type step_type = event.payload.step_type
# handle safety # handle safety
if ( if step_type == StepType.shield_call and event_type == EventType.step_complete.value:
step_type == StepType.shield_call
and event_type == EventType.step_complete.value
):
violation = event.payload.step_details.violation violation = event.payload.step_details.violation
if not violation: if not violation:
yield ( yield (
event, event,
LogEvent( LogEvent(role=step_type, content="No Violation", color="magenta"),
role=step_type, content="No Violation", color="magenta"
),
) )
else: else:
yield ( yield (
@ -110,9 +102,7 @@ class EventLogger:
# TODO: Currently this event is never received # TODO: Currently this event is never received
yield ( yield (
event, event,
LogEvent( LogEvent(role=step_type, content="", end="", color="yellow"),
role=step_type, content="", end="", color="yellow"
),
) )
elif event_type == EventType.step_progress.value: elif event_type == EventType.step_progress.value:
# HACK: if previous was not step/event was not inference's step_progress # HACK: if previous was not step/event was not inference's step_progress
@ -125,9 +115,7 @@ class EventLogger:
): ):
yield ( yield (
event, event,
LogEvent( LogEvent(role=step_type, content="", end="", color="yellow"),
role=step_type, content="", end="", color="yellow"
),
) )
delta = event.payload.delta delta = event.payload.delta
@ -161,9 +149,7 @@ class EventLogger:
if event_type == EventType.step_complete.value: if event_type == EventType.step_complete.value:
response = event.payload.step_details.model_response response = event.payload.step_details.model_response
if response.tool_calls: if response.tool_calls:
content = ToolUtils.encode_tool_call( content = ToolUtils.encode_tool_call(response.tool_calls[0], tool_prompt_format)
response.tool_calls[0], tool_prompt_format
)
else: else:
content = response.content content = response.content
yield ( yield (
@ -202,10 +188,7 @@ class EventLogger:
), ),
) )
if ( if step_type == StepType.memory_retrieval and event_type == EventType.step_complete.value:
step_type == StepType.memory_retrieval
and event_type == EventType.step_complete.value
):
details = event.payload.step_details details = event.payload.step_details
inserted_context = interleaved_content_as_str(details.inserted_context) inserted_context = interleaved_content_as_str(details.inserted_context)
content = f"fetched {len(inserted_context)} bytes from {details.vector_db_ids}" content = f"fetched {len(inserted_context)} bytes from {details.vector_db_ids}"
View file
@ -7,13 +7,15 @@
from typing import List, Optional, Protocol, runtime_checkable from typing import List, Optional, Protocol, runtime_checkable
from llama_models.schema_utils import json_schema_type, webmethod from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field from pydantic import BaseModel
from llama_stack.apis.inference import ( from llama_stack.apis.inference import (
CompletionMessage, ChatCompletionResponse,
CompletionResponse,
InterleavedContent, InterleavedContent,
LogProbConfig, LogProbConfig,
Message, Message,
ResponseFormat,
SamplingParams, SamplingParams,
ToolChoice, ToolChoice,
ToolDefinition, ToolDefinition,
@ -21,35 +23,14 @@ from llama_stack.apis.inference import (
) )
@json_schema_type
class BatchCompletionRequest(BaseModel):
model: str
content_batch: List[InterleavedContent]
sampling_params: Optional[SamplingParams] = SamplingParams()
logprobs: Optional[LogProbConfig] = None
@json_schema_type @json_schema_type
class BatchCompletionResponse(BaseModel): class BatchCompletionResponse(BaseModel):
completion_message_batch: List[CompletionMessage] batch: List[CompletionResponse]
@json_schema_type
class BatchChatCompletionRequest(BaseModel):
model: str
messages_batch: List[List[Message]]
sampling_params: Optional[SamplingParams] = SamplingParams()
# zero-shot tool definitions as input to the model
tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
logprobs: Optional[LogProbConfig] = None
@json_schema_type @json_schema_type
class BatchChatCompletionResponse(BaseModel): class BatchChatCompletionResponse(BaseModel):
completion_message_batch: List[CompletionMessage] batch: List[ChatCompletionResponse]
@runtime_checkable @runtime_checkable
@ -60,6 +41,7 @@ class BatchInference(Protocol):
model: str, model: str,
content_batch: List[InterleavedContent], content_batch: List[InterleavedContent],
sampling_params: Optional[SamplingParams] = SamplingParams(), sampling_params: Optional[SamplingParams] = SamplingParams(),
response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None, logprobs: Optional[LogProbConfig] = None,
) -> BatchCompletionResponse: ... ) -> BatchCompletionResponse: ...
@ -73,5 +55,6 @@ class BatchInference(Protocol):
tools: Optional[List[ToolDefinition]] = list, tools: Optional[List[ToolDefinition]] = list,
tool_choice: Optional[ToolChoice] = ToolChoice.auto, tool_choice: Optional[ToolChoice] = ToolChoice.auto,
tool_prompt_format: Optional[ToolPromptFormat] = None, tool_prompt_format: Optional[ToolPromptFormat] = None,
response_format: Optional[ResponseFormat] = None,
logprobs: Optional[LogProbConfig] = None, logprobs: Optional[LogProbConfig] = None,
) -> BatchChatCompletionResponse: ... ) -> BatchChatCompletionResponse: ...
View file
@ -4,14 +4,12 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
import base64
from enum import Enum from enum import Enum
from typing import Annotated, List, Literal, Optional, Union from typing import Annotated, List, Literal, Optional, Union
from llama_models.llama3.api.datatypes import ToolCall from llama_models.llama3.api.datatypes import ToolCall
from llama_models.schema_utils import json_schema_type, register_schema from llama_models.schema_utils import json_schema_type, register_schema
from pydantic import BaseModel, Field, field_serializer, model_validator from pydantic import BaseModel, Field, model_validator
@json_schema_type @json_schema_type
@ -20,8 +18,16 @@ class URL(BaseModel):
class _URLOrData(BaseModel): class _URLOrData(BaseModel):
"""
A URL or a base64 encoded string
:param url: A URL of the image or data URL in the format of data:image/{type};base64,{data}. Note that URL could have length limits.
:param data: base64 encoded image data as string
"""
url: Optional[URL] = None url: Optional[URL] = None
data: Optional[bytes] = None # data is a base64 encoded string, hint with contentEncoding=base64
data: Optional[str] = Field(contentEncoding="base64", default=None)
@model_validator(mode="before") @model_validator(mode="before")
@classmethod @classmethod
@ -30,21 +36,27 @@ class _URLOrData(BaseModel):
return values return values
return {"url": values} return {"url": values}
@field_serializer("data")
def serialize_data(self, data: Optional[bytes], _info):
if data is None:
return None
return base64.b64encode(data).decode("utf-8")
@json_schema_type @json_schema_type
class ImageContentItem(BaseModel): class ImageContentItem(BaseModel):
"""A image content item
:param type: Discriminator type of the content item. Always "image"
:param image: Image as a base64 encoded string or an URL
"""
type: Literal["image"] = "image" type: Literal["image"] = "image"
image: _URLOrData image: _URLOrData
@json_schema_type @json_schema_type
class TextContentItem(BaseModel): class TextContentItem(BaseModel):
"""A text content item
:param type: Discriminator type of the content item. Always "text"
:param text: Text content
"""
type: Literal["text"] = "text" type: Literal["text"] = "text"
text: str text: str
@ -77,7 +89,6 @@ class ImageDelta(BaseModel):
image: bytes image: bytes
@json_schema_type
class ToolCallParseStatus(Enum): class ToolCallParseStatus(Enum):
started = "started" started = "started"
in_progress = "in_progress" in_progress = "in_progress"
View file
@ -8,7 +8,6 @@ from enum import Enum
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
from llama_models.schema_utils import json_schema_type from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel from pydantic import BaseModel
from llama_stack.apis.common.content_types import URL from llama_stack.apis.common.content_types import URL
View file
@ -39,6 +39,4 @@ class DatasetIO(Protocol):
) -> PaginatedRowsResult: ... ) -> PaginatedRowsResult: ...
@webmethod(route="/datasetio/rows", method="POST") @webmethod(route="/datasetio/rows", method="POST")
async def append_rows( async def append_rows(self, dataset_id: str, rows: List[Dict[str, Any]]) -> None: ...
self, dataset_id: str, rows: List[Dict[str, Any]]
) -> None: ...
View file
@ -58,7 +58,7 @@ class Datasets(Protocol):
metadata: Optional[Dict[str, Any]] = None, metadata: Optional[Dict[str, Any]] = None,
) -> None: ... ) -> None: ...
@webmethod(route="/datasets/{dataset_id}", method="GET") @webmethod(route="/datasets/{dataset_id:path}", method="GET")
async def get_dataset( async def get_dataset(
self, self,
dataset_id: str, dataset_id: str,
@ -67,7 +67,7 @@ class Datasets(Protocol):
@webmethod(route="/datasets", method="GET") @webmethod(route="/datasets", method="GET")
async def list_datasets(self) -> ListDatasetsResponse: ... async def list_datasets(self) -> ListDatasetsResponse: ...
@webmethod(route="/datasets/{dataset_id}", method="DELETE") @webmethod(route="/datasets/{dataset_id:path}", method="DELETE")
async def unregister_dataset( async def unregister_dataset(
self, self,
dataset_id: str, dataset_id: str,
View file
@ -63,9 +63,7 @@ class AppEvalTaskConfig(BaseModel):
EvalTaskConfig = register_schema( EvalTaskConfig = register_schema(
Annotated[ Annotated[Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")],
Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")
],
name="EvalTaskConfig", name="EvalTaskConfig",
) )
View file
@ -13,8 +13,8 @@ from typing import (
Literal, Literal,
Optional, Optional,
Protocol, Protocol,
runtime_checkable,
Union, Union,
runtime_checkable,
) )
from llama_models.llama3.api.datatypes import ( from llama_models.llama3.api.datatypes import (
@ -31,15 +31,27 @@ from typing_extensions import Annotated
from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent from llama_stack.apis.common.content_types import ContentDelta, InterleavedContent
from llama_stack.apis.models import Model from llama_stack.apis.models import Model
from llama_stack.apis.telemetry.telemetry import MetricResponseMixin
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
class LogProbConfig(BaseModel): class LogProbConfig(BaseModel):
"""
:param top_k: How many tokens (for each position) to return log probabilities for.
"""
top_k: Optional[int] = 0 top_k: Optional[int] = 0
@json_schema_type
class QuantizationType(Enum): class QuantizationType(Enum):
"""Type of model quantization to run inference with.
:cvar bf16: BFloat16 typically this means _no_ quantization
:cvar fp8: 8-bit floating point quantization
:cvar int4: 4-bit integer quantization
"""
bf16 = "bf16" bf16 = "bf16"
fp8 = "fp8" fp8 = "fp8"
int4 = "int4" int4 = "int4"
@ -57,6 +69,12 @@ class Bf16QuantizationConfig(BaseModel):
@json_schema_type @json_schema_type
class Int4QuantizationConfig(BaseModel): class Int4QuantizationConfig(BaseModel):
"""Configuration for 4-bit integer quantization.
:param type: Must be "int4" to identify this quantization type
:param scheme: Quantization scheme to use. Defaults to "int4_weight_int8_dynamic_activation"
"""
type: Literal["int4"] = "int4" type: Literal["int4"] = "int4"
scheme: Optional[str] = "int4_weight_int8_dynamic_activation" scheme: Optional[str] = "int4_weight_int8_dynamic_activation"
@ -69,6 +87,13 @@ QuantizationConfig = Annotated[
@json_schema_type @json_schema_type
class UserMessage(BaseModel): class UserMessage(BaseModel):
"""A message from the user in a chat conversation.
:param role: Must be "user" to identify this as a user message
:param content: The content of the message, which can include text and other media
:param context: (Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future.
"""
role: Literal["user"] = "user" role: Literal["user"] = "user"
content: InterleavedContent content: InterleavedContent
context: Optional[InterleavedContent] = None context: Optional[InterleavedContent] = None
@ -76,15 +101,27 @@ class UserMessage(BaseModel):
@json_schema_type @json_schema_type
class SystemMessage(BaseModel): class SystemMessage(BaseModel):
"""A system message providing instructions or context to the model.
:param role: Must be "system" to identify this as a system message
:param content: The content of the "system prompt". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions).
"""
role: Literal["system"] = "system" role: Literal["system"] = "system"
content: InterleavedContent content: InterleavedContent
@json_schema_type @json_schema_type
class ToolResponseMessage(BaseModel): class ToolResponseMessage(BaseModel):
"""A message representing the result of a tool invocation.
:param role: Must be "tool" to identify this as a tool response
:param call_id: Unique identifier for the tool call this response is for
:param tool_name: Name of the tool that was called
:param content: The response content from the tool
"""
role: Literal["tool"] = "tool" role: Literal["tool"] = "tool"
# it was nice to re-use the ToolResponse type, but having all messages
# have a `content` type makes things nicer too
call_id: str call_id: str
tool_name: Union[BuiltinTool, str] tool_name: Union[BuiltinTool, str]
content: InterleavedContent content: InterleavedContent
@ -92,10 +129,21 @@ class ToolResponseMessage(BaseModel):
@json_schema_type @json_schema_type
class CompletionMessage(BaseModel): class CompletionMessage(BaseModel):
"""A message containing the model's (assistant) response in a chat conversation.
:param role: Must be "assistant" to identify this as the model's response
:param content: The content of the model's response
:param stop_reason: Reason why the model stopped generating. Options are:
- `StopReason.end_of_turn`: The model finished generating the entire response.
- `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response.
- `StopReason.out_of_tokens`: The model ran out of token budget.
:param tool_calls: List of tool calls. Each tool call is a ToolCall object.
"""
role: Literal["assistant"] = "assistant" role: Literal["assistant"] = "assistant"
content: InterleavedContent content: InterleavedContent
stop_reason: StopReason stop_reason: StopReason
tool_calls: List[ToolCall] = Field(default_factory=list) tool_calls: Optional[List[ToolCall]] = Field(default_factory=list)
Message = register_schema( Message = register_schema(
@ -129,19 +177,35 @@ class ToolResponse(BaseModel):
return v return v
@json_schema_type
class ToolChoice(Enum): class ToolChoice(Enum):
"""Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.
:cvar auto: The model may use tools if it determines that is appropriate.
:cvar required: The model must use tools.
"""
auto = "auto" auto = "auto"
required = "required" required = "required"
@json_schema_type @json_schema_type
class TokenLogProbs(BaseModel): class TokenLogProbs(BaseModel):
"""Log probabilities for generated tokens.
:param logprobs_by_token: Dictionary mapping tokens to their log probabilities
"""
logprobs_by_token: Dict[str, float] logprobs_by_token: Dict[str, float]
@json_schema_type
class ChatCompletionResponseEventType(Enum): class ChatCompletionResponseEventType(Enum):
"""Types of events that can occur during chat completion.
:cvar start: Inference has started
:cvar complete: Inference is complete and a full response is available
:cvar progress: Inference is in progress and a partial response is available
"""
start = "start" start = "start"
complete = "complete" complete = "complete"
progress = "progress" progress = "progress"
@ -149,7 +213,13 @@ class ChatCompletionResponseEventType(Enum):
@json_schema_type @json_schema_type
class ChatCompletionResponseEvent(BaseModel): class ChatCompletionResponseEvent(BaseModel):
"""Chat completion response event.""" """An event during chat completion generation.
:param event_type: Type of the event
:param delta: Content generated since last event. This can be one or more tokens, or a tool call.
:param logprobs: Optional log probabilities for generated tokens
:param stop_reason: Optional reason why generation stopped, if complete
"""
event_type: ChatCompletionResponseEventType event_type: ChatCompletionResponseEventType
delta: ContentDelta delta: ContentDelta
@ -157,22 +227,37 @@ class ChatCompletionResponseEvent(BaseModel):
stop_reason: Optional[StopReason] = None stop_reason: Optional[StopReason] = None
@json_schema_type
class ResponseFormatType(Enum): class ResponseFormatType(Enum):
"""Types of formats for structured (guided) decoding.
:cvar json_schema: Response should conform to a JSON schema. In a Python SDK, this is often a `pydantic` model.
:cvar grammar: Response should conform to a BNF grammar
"""
json_schema = "json_schema" json_schema = "json_schema"
grammar = "grammar" grammar = "grammar"
@json_schema_type @json_schema_type
class JsonSchemaResponseFormat(BaseModel): class JsonSchemaResponseFormat(BaseModel):
type: Literal[ResponseFormatType.json_schema.value] = ( """Configuration for JSON schema-guided response generation.
ResponseFormatType.json_schema.value
) :param type: Must be "json_schema" to identify this format type
:param json_schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model.
"""
type: Literal[ResponseFormatType.json_schema.value] = ResponseFormatType.json_schema.value
json_schema: Dict[str, Any] json_schema: Dict[str, Any]
@json_schema_type @json_schema_type
class GrammarResponseFormat(BaseModel): class GrammarResponseFormat(BaseModel):
"""Configuration for grammar-guided response generation.
:param type: Must be "grammar" to identify this format type
:param bnf: The BNF grammar specification the response should conform to
"""
type: Literal[ResponseFormatType.grammar.value] = ResponseFormatType.grammar.value type: Literal[ResponseFormatType.grammar.value] = ResponseFormatType.grammar.value
bnf: Dict[str, Any] bnf: Dict[str, Any]
@ -186,20 +271,24 @@ ResponseFormat = register_schema(
) )
@json_schema_type # This is an internally used class
class CompletionRequest(BaseModel): class CompletionRequest(BaseModel):
model: str model: str
content: InterleavedContent content: InterleavedContent
sampling_params: Optional[SamplingParams] = SamplingParams() sampling_params: Optional[SamplingParams] = SamplingParams()
response_format: Optional[ResponseFormat] = None response_format: Optional[ResponseFormat] = None
stream: Optional[bool] = False stream: Optional[bool] = False
logprobs: Optional[LogProbConfig] = None logprobs: Optional[LogProbConfig] = None
@json_schema_type @json_schema_type
class CompletionResponse(BaseModel): class CompletionResponse(BaseModel):
"""Completion response.""" """Response from a completion request.
:param content: The generated completion text
:param stop_reason: Reason why generation stopped
:param logprobs: Optional log probabilities for generated tokens
"""
content: str content: str
stop_reason: StopReason stop_reason: StopReason
@ -208,80 +297,95 @@ class CompletionResponse(BaseModel):
@json_schema_type @json_schema_type
class CompletionResponseStreamChunk(BaseModel): class CompletionResponseStreamChunk(BaseModel):
"""streamed completion response.""" """A chunk of a streamed completion response.
:param delta: New content generated since last chunk. This can be one or more tokens.
:param stop_reason: Optional reason why generation stopped, if complete
:param logprobs: Optional log probabilities for generated tokens
"""
delta: str delta: str
stop_reason: Optional[StopReason] = None stop_reason: Optional[StopReason] = None
    logprobs: Optional[List[TokenLogProbs]] = None


@json_schema_type
-class BatchCompletionRequest(BaseModel):
-    model: str
-    content_batch: List[InterleavedContent]
-    sampling_params: Optional[SamplingParams] = SamplingParams()
-    response_format: Optional[ResponseFormat] = None
-    logprobs: Optional[LogProbConfig] = None
-
-
-@json_schema_type
-class BatchCompletionResponse(BaseModel):
-    """Batch completion response."""
-
-    batch: List[CompletionResponse]
+class SystemMessageBehavior(Enum):
+    """Config for how to override the default system prompt.
+
+    :cvar append: Appends the provided system message to the default system prompt:
+        https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2/#-function-definitions-in-the-system-prompt-
+    :cvar replace: Replaces the default system prompt with the provided system message. The system message can include the string
+        '{{function_definitions}}' to indicate where the function definitions should be inserted.
+    """
+
+    append = "append"
+    replace = "replace"
+
+
+@json_schema_type
+class ToolConfig(BaseModel):
+    """Configuration for tool use.
+
+    :param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
+    :param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
+        - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
+        - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
+        - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
+    :param system_message_behavior: (Optional) Config for how to override the default system prompt.
+        - `SystemMessageBehavior.append`: Appends the provided system message to the default system prompt.
+        - `SystemMessageBehavior.replace`: Replaces the default system prompt with the provided system message. The system message can include the string
+            '{{function_definitions}}' to indicate where the function definitions should be inserted.
+    """
+
+    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
+    tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
+    system_message_behavior: SystemMessageBehavior = Field(default=SystemMessageBehavior.append)


+# This is an internally used class
@json_schema_type
class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[Message]
    sampling_params: Optional[SamplingParams] = SamplingParams()

-    # zero-shot tool definitions as input to the model
    tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
-    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
-    tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
+    tool_config: Optional[ToolConfig] = Field(default_factory=ToolConfig)

    response_format: Optional[ResponseFormat] = None
    stream: Optional[bool] = False
    logprobs: Optional[LogProbConfig] = None


@json_schema_type
-class ChatCompletionResponseStreamChunk(BaseModel):
-    """SSE-stream of these events."""
+class ChatCompletionResponseStreamChunk(MetricResponseMixin, BaseModel):
+    """A chunk of a streamed chat completion response.
+
+    :param event: The event containing the new content
+    """

    event: ChatCompletionResponseEvent


@json_schema_type
-class ChatCompletionResponse(BaseModel):
-    """Chat completion response."""
+class ChatCompletionResponse(MetricResponseMixin, BaseModel):
+    """Response from a chat completion request.
+
+    :param completion_message: The complete response message
+    :param logprobs: Optional log probabilities for generated tokens
+    """

    completion_message: CompletionMessage
    logprobs: Optional[List[TokenLogProbs]] = None


-@json_schema_type
-class BatchChatCompletionRequest(BaseModel):
-    model: str
-    messages_batch: List[List[Message]]
-    sampling_params: Optional[SamplingParams] = SamplingParams()
-
-    # zero-shot tool definitions as input to the model
-    tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
-    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
-    tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
-    logprobs: Optional[LogProbConfig] = None
-
-
-@json_schema_type
-class BatchChatCompletionResponse(BaseModel):
-    batch: List[ChatCompletionResponse]
-
-
@json_schema_type
class EmbeddingsResponse(BaseModel):
+    """Response containing generated embeddings.
+
+    :param embeddings: List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
+    """
+
    embeddings: List[List[float]]
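ToolConfig groups the tool-use options (tool_choice, tool_prompt_format, system_message_behavior) that previously sat directly on ChatCompletionRequest. A minimal sketch of assembling a request with the new field, using the types defined above; the import location and model identifier are assumptions for illustration, not part of this diff:

# Sketch only: import path and model id are assumed.
from llama_stack.apis.inference import (
    ChatCompletionRequest,
    SystemMessageBehavior,
    ToolChoice,
    ToolConfig,
    UserMessage,
)

request = ChatCompletionRequest(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model identifier
    messages=[UserMessage(content="What's the weather in San Francisco?")],
    tools=[],  # zero-shot ToolDefinition objects would go here
    tool_config=ToolConfig(
        tool_choice=ToolChoice.auto,
        system_message_behavior=SystemMessageBehavior.append,
    ),
)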
@ -292,6 +396,13 @@ class ModelStore(Protocol):
@runtime_checkable @runtime_checkable
@trace_protocol @trace_protocol
class Inference(Protocol): class Inference(Protocol):
"""Llama Stack Inference API for generating completions, chat completions, and embeddings.
This API provides the raw interface to the underlying models. Two kinds of models are supported:
- LLM models: these models generate "raw" and "chat" (conversational) completions.
- Embedding models: these models generate embeddings to be used for semantic search.
"""
model_store: ModelStore model_store: ModelStore
@webmethod(route="/inference/completion", method="POST") @webmethod(route="/inference/completion", method="POST")
@ -303,7 +414,19 @@ class Inference(Protocol):
        response_format: Optional[ResponseFormat] = None,
response_format: Optional[ResponseFormat] = None, response_format: Optional[ResponseFormat] = None,
stream: Optional[bool] = False, stream: Optional[bool] = False,
logprobs: Optional[LogProbConfig] = None, logprobs: Optional[LogProbConfig] = None,
-    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]: ...
+    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
"""Generate a completion for the given content using the specified model.
:param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param content: The content to generate a completion for
:param sampling_params: (Optional) Parameters to control the sampling strategy
:param response_format: (Optional) Grammar specification for guided (structured) decoding
:param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
:param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
:returns: If stream=False, returns a CompletionResponse with the full completion.
If stream=True, returns an SSE event stream of CompletionResponseStreamChunk
"""
...
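A hedged sketch of calling this method; `inference_impl` stands for any object implementing the Inference protocol, and the model id is a placeholder:

response = await inference_impl.completion(
    model_id="meta-llama/Llama-3.1-8B-Instruct",  # placeholder
    content="Write a haiku about GPUs.",
    stream=False,
)
print(response.content)  # assumes CompletionResponse exposes the generated text as `content`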
@webmethod(route="/inference/chat-completion", method="POST") @webmethod(route="/inference/chat-completion", method="POST")
async def chat_completion( async def chat_completion(
@ -311,20 +434,50 @@ class Inference(Protocol):
        model_id: str,
model_id: str, model_id: str,
messages: List[Message], messages: List[Message],
sampling_params: Optional[SamplingParams] = SamplingParams(), sampling_params: Optional[SamplingParams] = SamplingParams(),
-        # zero-shot tool definitions as input to the model
        tools: Optional[List[ToolDefinition]] = None,
        tool_choice: Optional[ToolChoice] = ToolChoice.auto,
        tool_prompt_format: Optional[ToolPromptFormat] = None,
        response_format: Optional[ResponseFormat] = None,
        stream: Optional[bool] = False,
        logprobs: Optional[LogProbConfig] = None,
+        tool_config: Optional[ToolConfig] = None,
-    ) -> Union[
-        ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]
-    ]: ...
+    ) -> Union[ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]]:
+        """Generate a chat completion for the given messages using the specified model.
:param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
:param messages: List of messages in the conversation
:param sampling_params: Parameters to control the sampling strategy
:param tools: (Optional) List of tool definitions available to the model
:param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
.. deprecated::
Use tool_config instead.
:param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
- `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
- `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
- `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
.. deprecated::
Use tool_config instead.
:param response_format: (Optional) Grammar specification for guided (structured) decoding. There are two options:
- `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format.
- `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it.
:param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
:param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
:param tool_config: (Optional) Configuration for tool use.
:returns: If stream=False, returns a ChatCompletionResponse with the full completion.
If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk
"""
...
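With tool_choice and tool_prompt_format deprecated in favor of tool_config, a call now looks roughly like this (again a sketch: `inference_impl`, the tool definitions, and the model id are placeholders):

response = await inference_impl.chat_completion(
    model_id="meta-llama/Llama-3.1-8B-Instruct",  # placeholder
    messages=[UserMessage(content="Book a table for two at 7pm.")],
    tools=my_tool_definitions,  # hypothetical list of ToolDefinition
    tool_config=ToolConfig(tool_choice=ToolChoice.auto),
    stream=False,
)
print(response.completion_message)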
@webmethod(route="/inference/embeddings", method="POST") @webmethod(route="/inference/embeddings", method="POST")
async def embeddings( async def embeddings(
self, self,
model_id: str, model_id: str,
contents: List[InterleavedContent], contents: List[InterleavedContent],
-    ) -> EmbeddingsResponse: ...
+    ) -> EmbeddingsResponse:
"""Generate embeddings for content pieces using the specified model.
:param model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
:param contents: List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text.
:returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
"""
...
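Usage sketch; the embedding model id is a placeholder, and plain strings are used as the interleaved contents:

result = await inference_impl.embeddings(
    model_id="all-MiniLM-L6-v2",  # placeholder embedding model
    contents=["llama stack", "a unified API layer for Llama models"],
)
print(len(result.embeddings), len(result.embeddings[0]))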

View file

@ -62,7 +62,7 @@ class Models(Protocol):
@webmethod(route="/models", method="GET") @webmethod(route="/models", method="GET")
async def list_models(self) -> ListModelsResponse: ... async def list_models(self) -> ListModelsResponse: ...
@webmethod(route="/models/{model_id}", method="GET") @webmethod(route="/models/{model_id:path}", method="GET")
async def get_model( async def get_model(
self, self,
model_id: str, model_id: str,
@ -78,7 +78,7 @@ class Models(Protocol):
model_type: Optional[ModelType] = None, model_type: Optional[ModelType] = None,
) -> Model: ... ) -> Model: ...
@webmethod(route="/models/{model_id}", method="DELETE") @webmethod(route="/models/{model_id:path}", method="DELETE")
async def unregister_model( async def unregister_model(
self, self,
model_id: str, model_id: str,
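The switch to the `:path` converter matters because model identifiers such as `meta-llama/Llama-3.1-8B-Instruct` contain slashes, which a plain `{model_id}` segment would not match. A minimal Starlette sketch of the converter semantics (illustration only; Llama Stack's actual server wiring may differ):

from starlette.applications import Starlette
from starlette.responses import JSONResponse
from starlette.routing import Route

async def get_model(request):
    # With ":path", everything after /models/ (slashes included) lands in one parameter.
    return JSONResponse({"model_id": request.path_params["model_id"]})

app = Starlette(routes=[Route("/models/{model_id:path}", get_model)])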

View file

@ -89,9 +89,7 @@ class QATFinetuningConfig(BaseModel):
AlgorithmConfig = register_schema( AlgorithmConfig = register_schema(
Annotated[ Annotated[Union[LoraFinetuningConfig, QATFinetuningConfig], Field(discriminator="type")],
Union[LoraFinetuningConfig, QATFinetuningConfig], Field(discriminator="type")
],
name="AlgorithmConfig", name="AlgorithmConfig",
) )
@ -204,14 +202,10 @@ class PostTraining(Protocol):
async def get_training_jobs(self) -> ListPostTrainingJobsResponse: ... async def get_training_jobs(self) -> ListPostTrainingJobsResponse: ...
@webmethod(route="/post-training/job/status", method="GET") @webmethod(route="/post-training/job/status", method="GET")
async def get_training_job_status( async def get_training_job_status(self, job_uuid: str) -> Optional[PostTrainingJobStatusResponse]: ...
self, job_uuid: str
) -> Optional[PostTrainingJobStatusResponse]: ...
@webmethod(route="/post-training/job/cancel", method="POST") @webmethod(route="/post-training/job/cancel", method="POST")
async def cancel_training_job(self, job_uuid: str) -> None: ... async def cancel_training_job(self, job_uuid: str) -> None: ...
@webmethod(route="/post-training/job/artifacts", method="GET") @webmethod(route="/post-training/job/artifacts", method="GET")
async def get_training_job_artifacts( async def get_training_job_artifacts(self, job_uuid: str) -> Optional[PostTrainingJobArtifactsResponse]: ...
self, job_uuid: str
) -> Optional[PostTrainingJobArtifactsResponse]: ...

View file

@ -6,11 +6,9 @@
from enum import Enum from enum import Enum
from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
@json_schema_type
class ResourceType(Enum): class ResourceType(Enum):
model = "model" model = "model"
shield = "shield" shield = "shield"
@ -25,9 +23,7 @@ class ResourceType(Enum):
class Resource(BaseModel): class Resource(BaseModel):
"""Base class for all Llama Stack resources""" """Base class for all Llama Stack resources"""
identifier: str = Field( identifier: str = Field(description="Unique identifier for this resource in llama stack")
description="Unique identifier for this resource in llama stack"
)
provider_resource_id: str = Field( provider_resource_id: str = Field(
description="Unique identifier for this resource in the provider", description="Unique identifier for this resource in the provider",
@ -36,6 +32,4 @@ class Resource(BaseModel):
provider_id: str = Field(description="ID of the provider that owns this resource") provider_id: str = Field(description="ID of the provider that owns this resource")
type: ResourceType = Field( type: ResourceType = Field(description="Type of resource (e.g. 'model', 'shield', 'vector_db', etc.)")
description="Type of resource (e.g. 'model', 'shield', 'vector_db', etc.)"
)

View file

@ -12,8 +12,8 @@ from typing import (
Literal, Literal,
Optional, Optional,
Protocol, Protocol,
runtime_checkable,
Union, Union,
runtime_checkable,
) )
from llama_models.schema_utils import json_schema_type, register_schema, webmethod from llama_models.schema_utils import json_schema_type, register_schema, webmethod
@ -43,9 +43,7 @@ class AggregationFunctionType(Enum):
@json_schema_type @json_schema_type
class LLMAsJudgeScoringFnParams(BaseModel): class LLMAsJudgeScoringFnParams(BaseModel):
type: Literal[ScoringFnParamsType.llm_as_judge.value] = ( type: Literal[ScoringFnParamsType.llm_as_judge.value] = ScoringFnParamsType.llm_as_judge.value
ScoringFnParamsType.llm_as_judge.value
)
judge_model: str judge_model: str
prompt_template: Optional[str] = None prompt_template: Optional[str] = None
judge_score_regexes: Optional[List[str]] = Field( judge_score_regexes: Optional[List[str]] = Field(
@ -60,9 +58,7 @@ class LLMAsJudgeScoringFnParams(BaseModel):
@json_schema_type @json_schema_type
class RegexParserScoringFnParams(BaseModel): class RegexParserScoringFnParams(BaseModel):
type: Literal[ScoringFnParamsType.regex_parser.value] = ( type: Literal[ScoringFnParamsType.regex_parser.value] = ScoringFnParamsType.regex_parser.value
ScoringFnParamsType.regex_parser.value
)
parsing_regexes: Optional[List[str]] = Field( parsing_regexes: Optional[List[str]] = Field(
description="Regex to extract the answer from generated response", description="Regex to extract the answer from generated response",
default_factory=list, default_factory=list,
@ -112,9 +108,7 @@ class CommonScoringFnFields(BaseModel):
@json_schema_type @json_schema_type
class ScoringFn(CommonScoringFnFields, Resource): class ScoringFn(CommonScoringFnFields, Resource):
type: Literal[ResourceType.scoring_function.value] = ( type: Literal[ResourceType.scoring_function.value] = ResourceType.scoring_function.value
ResourceType.scoring_function.value
)
@property @property
def scoring_fn_id(self) -> str: def scoring_fn_id(self) -> str:
@ -140,10 +134,8 @@ class ScoringFunctions(Protocol):
@webmethod(route="/scoring-functions", method="GET") @webmethod(route="/scoring-functions", method="GET")
async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ... async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ...
@webmethod(route="/scoring-functions/{scoring_fn_id}", method="GET") @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET")
async def get_scoring_function( async def get_scoring_function(self, scoring_fn_id: str, /) -> Optional[ScoringFn]: ...
self, scoring_fn_id: str, /
) -> Optional[ScoringFn]: ...
@webmethod(route="/scoring-functions", method="POST") @webmethod(route="/scoring-functions", method="POST")
async def register_scoring_function( async def register_scoring_function(

View file

@ -48,7 +48,7 @@ class Shields(Protocol):
@webmethod(route="/shields", method="GET") @webmethod(route="/shields", method="GET")
async def list_shields(self) -> ListShieldsResponse: ... async def list_shields(self) -> ListShieldsResponse: ...
@webmethod(route="/shields/{identifier}", method="GET") @webmethod(route="/shields/{identifier:path}", method="GET")
async def get_shield(self, identifier: str) -> Optional[Shield]: ... async def get_shield(self, identifier: str) -> Optional[Shield]: ...
@webmethod(route="/shields", method="POST") @webmethod(route="/shields", method="POST")

View file

@ -5,11 +5,9 @@
# the root directory of this source tree. # the root directory of this source tree.
from enum import Enum from enum import Enum
from typing import Any, Dict, List, Optional, Protocol, Union from typing import Any, Dict, List, Optional, Protocol, Union
from llama_models.schema_utils import json_schema_type, webmethod from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel from pydantic import BaseModel
from llama_stack.apis.inference import Message from llama_stack.apis.inference import Message

View file

@ -13,10 +13,11 @@ from typing import (
Literal, Literal,
Optional, Optional,
Protocol, Protocol,
runtime_checkable,
Union, Union,
runtime_checkable,
) )
from llama_models.llama3.api.datatypes import Primitive
from llama_models.schema_utils import json_schema_type, register_schema, webmethod from llama_models.schema_utils import json_schema_type, register_schema, webmethod
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing_extensions import Annotated from typing_extensions import Annotated
@ -76,7 +77,7 @@ class EventCommon(BaseModel):
trace_id: str trace_id: str
span_id: str span_id: str
timestamp: datetime timestamp: datetime
-    attributes: Optional[Dict[str, Any]] = Field(default_factory=dict)
+    attributes: Optional[Dict[str, Primitive]] = Field(default_factory=dict)
@json_schema_type @json_schema_type
@ -94,6 +95,30 @@ class MetricEvent(EventCommon):
unit: str unit: str
# This is a short term solution to allow inference API to return metrics
# The ideal way to do this is to have a way for all response types to include metrics
# and all metric events logged to the telemetry API to be included with the response
# To do this, we will need to augment all response types with a metrics field.
# We have hit a blocker from stainless SDK that prevents us from doing this.
# The blocker is that if we were to augment the response types that have a data field
# in them like so
# class ListModelsResponse(BaseModel):
# metrics: Optional[List[MetricEvent]] = None
# data: List[Models]
# ...
# The client SDK will need to access the data by using a .data field, which is not
# ergonomic. Stainless SDK does support unwrapping the response type, but it
# requires that the response type to only have a single field.
# We will need a way in the client SDK to signal that the metrics are needed
# and if they are needed, the client SDK has to return the full response type
# without unwrapping it.
class MetricResponseMixin(BaseModel):
metrics: Optional[List[MetricEvent]] = None
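Once the inference response types mix this in (see ChatCompletionResponse above), callers can read provider-reported metrics straight off the response. A sketch, assuming MetricEvent carries `metric`, `value`, and `unit` fields and that the provider actually populates them:

response = await inference_impl.chat_completion(model_id="placeholder-model", messages=messages)
for metric in response.metrics or []:
    print(metric.metric, metric.value, metric.unit)  # e.g. prompt/completion token counters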
@json_schema_type @json_schema_type
class StructuredLogType(Enum): class StructuredLogType(Enum):
SPAN_START = "span_start" SPAN_START = "span_start"
@ -102,9 +127,7 @@ class StructuredLogType(Enum):
@json_schema_type @json_schema_type
class SpanStartPayload(BaseModel): class SpanStartPayload(BaseModel):
type: Literal[StructuredLogType.SPAN_START.value] = ( type: Literal[StructuredLogType.SPAN_START.value] = StructuredLogType.SPAN_START.value
StructuredLogType.SPAN_START.value
)
name: str name: str
parent_span_id: Optional[str] = None parent_span_id: Optional[str] = None
@ -190,9 +213,7 @@ class QuerySpanTreeResponse(BaseModel):
@runtime_checkable @runtime_checkable
class Telemetry(Protocol): class Telemetry(Protocol):
@webmethod(route="/telemetry/events", method="POST") @webmethod(route="/telemetry/events", method="POST")
async def log_event( async def log_event(self, event: Event, ttl_seconds: int = DEFAULT_TTL_DAYS * 86400) -> None: ...
self, event: Event, ttl_seconds: int = DEFAULT_TTL_DAYS * 86400
) -> None: ...
@webmethod(route="/telemetry/traces", method="GET") @webmethod(route="/telemetry/traces", method="GET")
async def query_traces( async def query_traces(
@ -203,13 +224,13 @@ class Telemetry(Protocol):
order_by: Optional[List[str]] = None, order_by: Optional[List[str]] = None,
) -> QueryTracesResponse: ... ) -> QueryTracesResponse: ...
@webmethod(route="/telemetry/traces/{trace_id}", method="GET") @webmethod(route="/telemetry/traces/{trace_id:path}", method="GET")
async def get_trace(self, trace_id: str) -> Trace: ... async def get_trace(self, trace_id: str) -> Trace: ...
@webmethod(route="/telemetry/traces/{trace_id}/spans/{span_id}", method="GET") @webmethod(route="/telemetry/traces/{trace_id:path}/spans/{span_id:path}", method="GET")
async def get_span(self, trace_id: str, span_id: str) -> Span: ... async def get_span(self, trace_id: str, span_id: str) -> Span: ...
@webmethod(route="/telemetry/spans/{span_id}/tree", method="GET") @webmethod(route="/telemetry/spans/{span_id:path}/tree", method="GET")
async def get_span_tree( async def get_span_tree(
self, self,
span_id: str, span_id: str,

View file

@ -4,5 +4,5 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from .tools import * # noqa: F401 F403
from .rag_tool import * # noqa: F401 F403 from .rag_tool import * # noqa: F401 F403
from .tools import * # noqa: F401 F403

View file

@ -11,7 +11,7 @@ from llama_models.schema_utils import json_schema_type, register_schema, webmeth
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing_extensions import Annotated, Protocol, runtime_checkable from typing_extensions import Annotated, Protocol, runtime_checkable
from llama_stack.apis.common.content_types import InterleavedContent, URL from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -64,9 +64,7 @@ RAGQueryGeneratorConfig = register_schema(
class RAGQueryConfig(BaseModel): class RAGQueryConfig(BaseModel):
# This config defines how a query is generated using the messages # This config defines how a query is generated using the messages
# for memory bank retrieval. # for memory bank retrieval.
query_generator_config: RAGQueryGeneratorConfig = Field( query_generator_config: RAGQueryGeneratorConfig = Field(default=DefaultRAGQueryGeneratorConfig())
default=DefaultRAGQueryGeneratorConfig()
)
max_tokens_in_context: int = 4096 max_tokens_in_context: int = 4096
max_chunks: int = 5 max_chunks: int = 5

View file

@ -11,7 +11,7 @@ from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing_extensions import Protocol, runtime_checkable from typing_extensions import Protocol, runtime_checkable
from llama_stack.apis.common.content_types import InterleavedContent, URL from llama_stack.apis.common.content_types import URL, InterleavedContent
from llama_stack.apis.resource import Resource, ResourceType from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -101,7 +101,7 @@ class ToolGroups(Protocol):
"""Register a tool group""" """Register a tool group"""
... ...
@webmethod(route="/toolgroups/{toolgroup_id}", method="GET") @webmethod(route="/toolgroups/{toolgroup_id:path}", method="GET")
async def get_tool_group( async def get_tool_group(
self, self,
toolgroup_id: str, toolgroup_id: str,
@ -117,13 +117,13 @@ class ToolGroups(Protocol):
"""List tools with optional tool group""" """List tools with optional tool group"""
... ...
@webmethod(route="/tools/{tool_name}", method="GET") @webmethod(route="/tools/{tool_name:path}", method="GET")
async def get_tool( async def get_tool(
self, self,
tool_name: str, tool_name: str,
) -> Tool: ... ) -> Tool: ...
@webmethod(route="/toolgroups/{toolgroup_id}", method="DELETE") @webmethod(route="/toolgroups/{toolgroup_id:path}", method="DELETE")
async def unregister_toolgroup( async def unregister_toolgroup(
self, self,
toolgroup_id: str, toolgroup_id: str,
@ -150,8 +150,6 @@ class ToolRuntime(Protocol):
) -> List[ToolDef]: ... ) -> List[ToolDef]: ...
@webmethod(route="/tool-runtime/invoke", method="POST") @webmethod(route="/tool-runtime/invoke", method="POST")
async def invoke_tool( async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> ToolInvocationResult:
self, tool_name: str, kwargs: Dict[str, Any]
) -> ToolInvocationResult:
"""Run a tool with the given arguments""" """Run a tool with the given arguments"""
... ...
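Usage sketch; `tool_runtime` is any ToolRuntime implementation, the tool name and arguments are hypothetical, and the result is assumed to expose a `content` field:

result = await tool_runtime.invoke_tool(
    tool_name="web_search",  # hypothetical tool
    kwargs={"query": "latest llama-stack release"},
)
print(result.content)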

View file

@ -46,7 +46,7 @@ class VectorDBs(Protocol):
@webmethod(route="/vector-dbs", method="GET") @webmethod(route="/vector-dbs", method="GET")
async def list_vector_dbs(self) -> ListVectorDBsResponse: ... async def list_vector_dbs(self) -> ListVectorDBsResponse: ...
@webmethod(route="/vector-dbs/{vector_db_id}", method="GET") @webmethod(route="/vector-dbs/{vector_db_id:path}", method="GET")
async def get_vector_db( async def get_vector_db(
self, self,
vector_db_id: str, vector_db_id: str,
@ -62,5 +62,5 @@ class VectorDBs(Protocol):
provider_vector_db_id: Optional[str] = None, provider_vector_db_id: Optional[str] = None,
) -> VectorDB: ... ) -> VectorDB: ...
@webmethod(route="/vector-dbs/{vector_db_id}", method="DELETE") @webmethod(route="/vector-dbs/{vector_db_id:path}", method="DELETE")
async def unregister_vector_db(self, vector_db_id: str) -> None: ... async def unregister_vector_db(self, vector_db_id: str) -> None: ...

View file

@ -16,11 +16,9 @@ from pathlib import Path
from typing import Dict, List, Optional from typing import Dict, List, Optional
import httpx import httpx
from llama_models.datatypes import Model from llama_models.datatypes import Model
from llama_models.sku_list import LlamaDownloadInfo from llama_models.sku_list import LlamaDownloadInfo
from pydantic import BaseModel, ConfigDict from pydantic import BaseModel, ConfigDict
from rich.console import Console from rich.console import Console
from rich.progress import ( from rich.progress import (
BarColumn, BarColumn,
@ -147,9 +145,7 @@ class ParallelDownloader:
"follow_redirects": True, "follow_redirects": True,
} }
async def retry_with_exponential_backoff( async def retry_with_exponential_backoff(self, task: DownloadTask, func, *args, **kwargs):
self, task: DownloadTask, func, *args, **kwargs
):
last_exception = None last_exception = None
for attempt in range(task.max_retries): for attempt in range(task.max_retries):
try: try:
@ -166,13 +162,9 @@ class ParallelDownloader:
continue continue
raise last_exception raise last_exception
async def get_file_info( async def get_file_info(self, client: httpx.AsyncClient, task: DownloadTask) -> None:
self, client: httpx.AsyncClient, task: DownloadTask
) -> None:
async def _get_info(): async def _get_info():
response = await client.head( response = await client.head(task.url, headers={"Accept-Encoding": "identity"}, **self.client_options)
task.url, headers={"Accept-Encoding": "identity"}, **self.client_options
)
response.raise_for_status() response.raise_for_status()
return response return response
@ -201,14 +193,10 @@ class ParallelDownloader:
return False return False
return os.path.getsize(task.output_file) == task.total_size return os.path.getsize(task.output_file) == task.total_size
async def download_chunk( async def download_chunk(self, client: httpx.AsyncClient, task: DownloadTask, start: int, end: int) -> None:
self, client: httpx.AsyncClient, task: DownloadTask, start: int, end: int
) -> None:
async def _download_chunk(): async def _download_chunk():
headers = {"Range": f"bytes={start}-{end}"} headers = {"Range": f"bytes={start}-{end}"}
async with client.stream( async with client.stream("GET", task.url, headers=headers, **self.client_options) as response:
"GET", task.url, headers=headers, **self.client_options
) as response:
response.raise_for_status() response.raise_for_status()
with open(task.output_file, "ab") as file: with open(task.output_file, "ab") as file:
@ -225,8 +213,7 @@ class ParallelDownloader:
await self.retry_with_exponential_backoff(task, _download_chunk) await self.retry_with_exponential_backoff(task, _download_chunk)
except Exception as e: except Exception as e:
raise DownloadError( raise DownloadError(
f"Failed to download chunk {start}-{end} after " f"Failed to download chunk {start}-{end} after {task.max_retries} attempts: {str(e)}"
f"{task.max_retries} attempts: {str(e)}"
) from e ) from e
async def prepare_download(self, task: DownloadTask) -> None: async def prepare_download(self, task: DownloadTask) -> None:
@ -244,9 +231,7 @@ class ParallelDownloader:
# Check if file is already downloaded # Check if file is already downloaded
if os.path.exists(task.output_file): if os.path.exists(task.output_file):
if self.verify_file_integrity(task): if self.verify_file_integrity(task):
self.console.print( self.console.print(f"[green]Already downloaded {task.output_file}[/green]")
f"[green]Already downloaded {task.output_file}[/green]"
)
self.progress.update(task.task_id, completed=task.total_size) self.progress.update(task.task_id, completed=task.total_size)
return return
@ -259,9 +244,7 @@ class ParallelDownloader:
current_pos = task.downloaded_size current_pos = task.downloaded_size
while current_pos < task.total_size: while current_pos < task.total_size:
chunk_end = min( chunk_end = min(current_pos + chunk_size - 1, task.total_size - 1)
current_pos + chunk_size - 1, task.total_size - 1
)
chunks.append((current_pos, chunk_end)) chunks.append((current_pos, chunk_end))
current_pos = chunk_end + 1 current_pos = chunk_end + 1
@ -273,18 +256,12 @@ class ParallelDownloader:
raise DownloadError(f"Download failed: {str(e)}") from e raise DownloadError(f"Download failed: {str(e)}") from e
except Exception as e: except Exception as e:
self.progress.update( self.progress.update(task.task_id, description=f"[red]Failed: {task.output_file}[/red]")
task.task_id, description=f"[red]Failed: {task.output_file}[/red]" raise DownloadError(f"Download failed for {task.output_file}: {str(e)}") from e
)
raise DownloadError(
f"Download failed for {task.output_file}: {str(e)}"
) from e
def has_disk_space(self, tasks: List[DownloadTask]) -> bool: def has_disk_space(self, tasks: List[DownloadTask]) -> bool:
try: try:
total_remaining_size = sum( total_remaining_size = sum(task.total_size - task.downloaded_size for task in tasks)
task.total_size - task.downloaded_size for task in tasks
)
dir_path = os.path.dirname(os.path.abspath(tasks[0].output_file)) dir_path = os.path.dirname(os.path.abspath(tasks[0].output_file))
free_space = shutil.disk_usage(dir_path).free free_space = shutil.disk_usage(dir_path).free
@ -314,9 +291,7 @@ class ParallelDownloader:
with self.progress: with self.progress:
for task in tasks: for task in tasks:
desc = f"Downloading {Path(task.output_file).name}" desc = f"Downloading {Path(task.output_file).name}"
task.task_id = self.progress.add_task( task.task_id = self.progress.add_task(desc, total=task.total_size, completed=task.downloaded_size)
desc, total=task.total_size, completed=task.downloaded_size
)
semaphore = asyncio.Semaphore(self.max_concurrent_downloads) semaphore = asyncio.Semaphore(self.max_concurrent_downloads)
@ -332,9 +307,7 @@ class ParallelDownloader:
if failed_tasks: if failed_tasks:
self.console.print("\n[red]Some downloads failed:[/red]") self.console.print("\n[red]Some downloads failed:[/red]")
for task, error in failed_tasks: for task, error in failed_tasks:
self.console.print( self.console.print(f"[red]- {Path(task.output_file).name}: {error}[/red]")
f"[red]- {Path(task.output_file).name}: {error}[/red]"
)
raise DownloadError(f"{len(failed_tasks)} downloads failed") raise DownloadError(f"{len(failed_tasks)} downloads failed")
@ -396,11 +369,7 @@ def _meta_download(
output_file = str(output_dir / f) output_file = str(output_dir / f)
url = meta_url.replace("*", f"{info.folder}/{f}") url = meta_url.replace("*", f"{info.folder}/{f}")
total_size = info.pth_size if "consolidated" in f else 0 total_size = info.pth_size if "consolidated" in f else 0
tasks.append( tasks.append(DownloadTask(url=url, output_file=output_file, total_size=total_size, max_retries=3))
DownloadTask(
url=url, output_file=output_file, total_size=total_size, max_retries=3
)
)
# Initialize and run parallel downloader # Initialize and run parallel downloader
downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads) downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
@ -446,14 +415,10 @@ def _download_from_manifest(manifest_file: str, max_concurrent_downloads: int):
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
if any(output_dir.iterdir()): if any(output_dir.iterdir()):
console.print( console.print(f"[yellow]Output directory {output_dir} is not empty.[/yellow]")
f"[yellow]Output directory {output_dir} is not empty.[/yellow]"
)
while True: while True:
resp = input( resp = input("Do you want to (C)ontinue download or (R)estart completely? (continue/restart): ")
"Do you want to (C)ontinue download or (R)estart completely? (continue/restart): "
)
if resp.lower() in ["restart", "r"]: if resp.lower() in ["restart", "r"]:
shutil.rmtree(output_dir) shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
@ -471,9 +436,7 @@ def _download_from_manifest(manifest_file: str, max_concurrent_downloads: int):
] ]
# Initialize and run parallel downloader # Initialize and run parallel downloader
downloader = ParallelDownloader( downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
max_concurrent_downloads=max_concurrent_downloads
)
asyncio.run(downloader.download_all(tasks)) asyncio.run(downloader.download_all(tasks))

View file

@ -8,7 +8,6 @@ import argparse
import json import json
from llama_models.sku_list import resolve_model from llama_models.sku_list import resolve_model
from termcolor import colored from termcolor import colored
from llama_stack.cli.subcommand import Subcommand from llama_stack.cli.subcommand import Subcommand

View file

@ -38,7 +38,7 @@ class ModelList(Subcommand):
headers = [ headers = [
"Model Descriptor", "Model Descriptor",
"Hugging Face Repo", "Model ID",
"Context Length", "Context Length",
] ]

View file

@ -11,7 +11,6 @@ from llama_stack.cli.model.download import ModelDownload
from llama_stack.cli.model.list import ModelList from llama_stack.cli.model.list import ModelList
from llama_stack.cli.model.prompt_format import ModelPromptFormat from llama_stack.cli.model.prompt_format import ModelPromptFormat
from llama_stack.cli.model.verify_download import ModelVerifyDownload from llama_stack.cli.model.verify_download import ModelVerifyDownload
from llama_stack.cli.subcommand import Subcommand from llama_stack.cli.subcommand import Subcommand
@ -26,6 +25,8 @@ class ModelParser(Subcommand):
description="Work with llama models", description="Work with llama models",
) )
self.parser.set_defaults(func=lambda args: self.parser.print_help())
subparsers = self.parser.add_subparsers(title="model_subcommands") subparsers = self.parser.add_subparsers(title="model_subcommands")
# Add sub-commands # Add sub-commands

View file

@ -8,7 +8,7 @@ import argparse
import textwrap import textwrap
from io import StringIO from io import StringIO
from llama_models.datatypes import CoreModelId, is_multimodal, model_family, ModelFamily from llama_models.datatypes import CoreModelId, ModelFamily, is_multimodal, model_family
from llama_stack.cli.subcommand import Subcommand from llama_stack.cli.subcommand import Subcommand
@ -47,33 +47,20 @@ class ModelPromptFormat(Subcommand):
# Only Llama 3.1 and 3.2 are supported # Only Llama 3.1 and 3.2 are supported
supported_model_ids = [ supported_model_ids = [
m m for m in CoreModelId if model_family(m) in {ModelFamily.llama3_1, ModelFamily.llama3_2}
for m in CoreModelId
if model_family(m) in {ModelFamily.llama3_1, ModelFamily.llama3_2}
] ]
model_str = "\n".join([m.value for m in supported_model_ids]) model_str = "\n".join([m.value for m in supported_model_ids])
try: try:
model_id = CoreModelId(args.model_name) model_id = CoreModelId(args.model_name)
except ValueError: except ValueError:
self.parser.error( self.parser.error(f"{args.model_name} is not a valid Model. Choose one from --\n{model_str}")
f"{args.model_name} is not a valid Model. Choose one from --\n{model_str}"
)
if model_id not in supported_model_ids: if model_id not in supported_model_ids:
self.parser.error( self.parser.error(f"{model_id} is not a valid Model. Choose one from --\n {model_str}")
f"{model_id} is not a valid Model. Choose one from --\n {model_str}"
)
llama_3_1_file = ( llama_3_1_file = importlib.resources.files("llama_models") / "llama3_1/prompt_format.md"
importlib.resources.files("llama_models") / "llama3_1/prompt_format.md" llama_3_2_text_file = importlib.resources.files("llama_models") / "llama3_2/text_prompt_format.md"
) llama_3_2_vision_file = importlib.resources.files("llama_models") / "llama3_2/vision_prompt_format.md"
llama_3_2_text_file = (
importlib.resources.files("llama_models") / "llama3_2/text_prompt_format.md"
)
llama_3_2_vision_file = (
importlib.resources.files("llama_models")
/ "llama3_2/vision_prompt_format.md"
)
if model_family(model_id) == ModelFamily.llama3_1: if model_family(model_id) == ModelFamily.llama3_1:
with importlib.resources.as_file(llama_3_1_file) as f: with importlib.resources.as_file(llama_3_1_file) as f:
content = f.open("r").read() content = f.open("r").read()

View file

@ -9,7 +9,6 @@ from typing import Any, Dict, Optional
from llama_models.datatypes import CheckpointQuantizationFormat from llama_models.datatypes import CheckpointQuantizationFormat
from llama_models.llama3.api.datatypes import SamplingParams from llama_models.llama3.api.datatypes import SamplingParams
from llama_models.sku_list import LlamaDownloadInfo from llama_models.sku_list import LlamaDownloadInfo
from pydantic import BaseModel, ConfigDict, Field from pydantic import BaseModel, ConfigDict, Field
@ -17,16 +16,12 @@ class PromptGuardModel(BaseModel):
"""Make a 'fake' Model-like object for Prompt Guard. Eventually this will be removed.""" """Make a 'fake' Model-like object for Prompt Guard. Eventually this will be removed."""
model_id: str = "Prompt-Guard-86M" model_id: str = "Prompt-Guard-86M"
description: str = ( description: str = "Prompt Guard. NOTE: this model will not be provided via `llama` CLI soon."
"Prompt Guard. NOTE: this model will not be provided via `llama` CLI soon."
)
is_featured: bool = False is_featured: bool = False
huggingface_repo: str = "meta-llama/Prompt-Guard-86M" huggingface_repo: str = "meta-llama/Prompt-Guard-86M"
max_seq_length: int = 2048 max_seq_length: int = 2048
is_instruct_model: bool = False is_instruct_model: bool = False
quantization_format: CheckpointQuantizationFormat = ( quantization_format: CheckpointQuantizationFormat = CheckpointQuantizationFormat.bf16
CheckpointQuantizationFormat.bf16
)
arch_args: Dict[str, Any] = Field(default_factory=dict) arch_args: Dict[str, Any] = Field(default_factory=dict)
recommended_sampling_params: Optional[SamplingParams] = None recommended_sampling_params: Optional[SamplingParams] = None

View file

@ -21,8 +21,12 @@ from prompt_toolkit.validation import Validator
from termcolor import cprint from termcolor import cprint
from llama_stack.cli.table import print_table from llama_stack.cli.table import print_table
from llama_stack.distribution.build import (
from llama_stack.distribution.build import build_image, ImageType SERVER_DEPENDENCIES,
ImageType,
build_image,
get_provider_dependencies,
)
from llama_stack.distribution.datatypes import ( from llama_stack.distribution.datatypes import (
BuildConfig, BuildConfig,
DistributionSpec, DistributionSpec,
@ -35,7 +39,6 @@ from llama_stack.distribution.utils.config_dirs import DISTRIBS_BASE_DIR
from llama_stack.distribution.utils.dynamic import instantiate_class_type from llama_stack.distribution.utils.dynamic import instantiate_class_type
from llama_stack.providers.datatypes import Api from llama_stack.providers.datatypes import Api
TEMPLATES_PATH = Path(__file__).parent.parent.parent / "templates" TEMPLATES_PATH = Path(__file__).parent.parent.parent / "templates"
@ -52,9 +55,7 @@ def available_templates_specs() -> Dict[str, BuildConfig]:
return template_specs return template_specs
-def run_stack_build_command(
-    parser: argparse.ArgumentParser, args: argparse.Namespace
-) -> None:
+def run_stack_build_command(args: argparse.Namespace) -> None:
if args.list_templates: if args.list_templates:
return _run_template_list_cmd() return _run_template_list_cmd()
@ -74,18 +75,11 @@ def run_stack_build_command(
build_config.image_type = args.image_type build_config.image_type = args.image_type
else: else:
cprint( cprint(
f"Please specify a image-type (docker | conda | venv) for {args.template}", f"Please specify a image-type (container | conda | venv) for {args.template}",
color="red", color="red",
) )
return return
-        _run_stack_build_command_from_build_config(
-            build_config,
-            image_name=image_name,
-            template_name=args.template,
-        )
-        return
-
-    if not args.config and not args.template:
+    elif not args.config and not args.template:
name = prompt( name = prompt(
"> Enter a name for your Llama Stack (e.g. my-local-stack): ", "> Enter a name for your Llama Stack (e.g. my-local-stack): ",
validator=Validator.from_callable( validator=Validator.from_callable(
@ -95,10 +89,10 @@ def run_stack_build_command(
) )
        image_type = prompt(
-            "> Enter the image type you want your Llama Stack to be built as (docker or conda or venv): ",
+            "> Enter the image type you want your Llama Stack to be built as (container or conda or venv): ",
            validator=Validator.from_callable(
-                lambda x: x in ["docker", "conda", "venv"],
-                error_message="Invalid image type, please enter conda or docker or venv",
+                lambda x: x in ["container", "conda", "venv"],
+                error_message="Invalid image type, please enter conda or container or venv",
            ),
            default="conda",
        )
@ -132,11 +126,7 @@ def run_stack_build_command(
providers = dict() providers = dict()
for api, providers_for_api in get_provider_registry().items(): for api, providers_for_api in get_provider_registry().items():
available_providers = [ available_providers = [x for x in providers_for_api.keys() if x not in ("remote", "remote::sample")]
x
for x in providers_for_api.keys()
if x not in ("remote", "remote::sample")
]
api_provider = prompt( api_provider = prompt(
"> Enter provider for API {}: ".format(api.value), "> Enter provider for API {}: ".format(api.value),
completer=WordCompleter(available_providers), completer=WordCompleter(available_providers),
@ -159,9 +149,7 @@ def run_stack_build_command(
description=description, description=description,
) )
build_config = BuildConfig( build_config = BuildConfig(image_type=image_type, distribution_spec=distribution_spec)
image_type=image_type, distribution_spec=distribution_spec
)
else: else:
with open(args.config, "r") as f: with open(args.config, "r") as f:
try: try:
@ -180,8 +168,20 @@ def run_stack_build_command(
) )
return return
if args.print_deps_only:
print(f"# Dependencies for {args.template or args.config or image_name}")
normal_deps, special_deps = get_provider_dependencies(build_config.distribution_spec.providers)
normal_deps += SERVER_DEPENDENCIES
print(f"uv pip install {' '.join(normal_deps)}")
for special_dep in special_deps:
print(f"uv pip install {special_dep}")
return
    _run_stack_build_command_from_build_config(
-        build_config, image_name=image_name, config_path=args.config
+        build_config,
+        image_name=image_name,
+        config_path=args.config,
+        template_name=args.template,
    )
@ -195,9 +195,7 @@ def _generate_run_config(
""" """
apis = list(build_config.distribution_spec.providers.keys()) apis = list(build_config.distribution_spec.providers.keys())
run_config = StackRunConfig( run_config = StackRunConfig(
container_image=( container_image=(image_name if build_config.image_type == ImageType.container.value else None),
image_name if build_config.image_type == ImageType.container.value else None
),
image_name=image_name, image_name=image_name,
apis=apis, apis=apis,
providers={}, providers={},
@ -217,13 +215,9 @@ def _generate_run_config(
if p.deprecation_error: if p.deprecation_error:
raise InvalidProviderError(p.deprecation_error) raise InvalidProviderError(p.deprecation_error)
config_type = instantiate_class_type( config_type = instantiate_class_type(provider_registry[Api(api)][provider_type].config_class)
provider_registry[Api(api)][provider_type].config_class
)
if hasattr(config_type, "sample_run_config"): if hasattr(config_type, "sample_run_config"):
config = config_type.sample_run_config( config = config_type.sample_run_config(__distro_dir__=f"distributions/{image_name}")
__distro_dir__=f"distributions/{image_name}"
)
else: else:
config = {} config = {}
@ -258,9 +252,7 @@ def _run_stack_build_command_from_build_config(
image_name = f"distribution-{template_name}" image_name = f"distribution-{template_name}"
else: else:
if not image_name: if not image_name:
-            raise ValueError(
-                "Please specify an image name when building a docker image without a template"
-            )
+            raise ValueError("Please specify an image name when building a container image without a template")
elif build_config.image_type == ImageType.conda.value: elif build_config.image_type == ImageType.conda.value:
if not image_name: if not image_name:
raise ValueError("Please specify an image name when building a conda image") raise ValueError("Please specify an image name when building a conda image")
@ -288,10 +280,7 @@ def _run_stack_build_command_from_build_config(
if template_name: if template_name:
# copy run.yaml from template to build_dir instead of generating it again # copy run.yaml from template to build_dir instead of generating it again
template_path = ( template_path = importlib.resources.files("llama_stack") / f"templates/{template_name}/run.yaml"
importlib.resources.files("llama_stack")
/ f"templates/{template_name}/run.yaml"
)
with importlib.resources.as_file(template_path) as path: with importlib.resources.as_file(template_path) as path:
run_config_file = build_dir / f"{template_name}-run.yaml" run_config_file = build_dir / f"{template_name}-run.yaml"
shutil.copy(path, run_config_file) shutil.copy(path, run_config_file)

View file

@ -63,10 +63,16 @@ environment is active, you must specify a name.
), ),
default=None, default=None,
) )
self.parser.add_argument(
"--print-deps-only",
default=False,
action="store_true",
help="Print the dependencies for the stack only, without building the stack",
)
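For example (the template name here is just an illustration), the flag prints the pip install commands for a build without actually building anything:

llama stack build --template ollama --print-deps-only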
def _run_stack_build_command(self, args: argparse.Namespace) -> None: def _run_stack_build_command(self, args: argparse.Namespace) -> None:
# always keep implementation completely silo-ed away from CLI so CLI # always keep implementation completely silo-ed away from CLI so CLI
# can be fast to load and reduces dependencies # can be fast to load and reduces dependencies
from ._build import run_stack_build_command from ._build import run_stack_build_command
-        return run_stack_build_command(self.parser, args)
+        return run_stack_build_command(args)

View file

@ -21,15 +21,19 @@ class StackListProviders(Subcommand):
self._add_arguments() self._add_arguments()
self.parser.set_defaults(func=self._run_providers_list_cmd) self.parser.set_defaults(func=self._run_providers_list_cmd)
-    def _add_arguments(self):
-        from llama_stack.distribution.datatypes import Api
-
-        api_values = [a.value for a in Api]
-
-        self.parser.add_argument(
-            "api",
-            type=str,
-            choices=api_values,
-            help="API to list providers for (one of: {})".format(api_values),
-        )
+    @property
+    def providable_apis(self):
+        from llama_stack.distribution.distribution import providable_apis
+
+        return [api.value for api in providable_apis()]
+
+    def _add_arguments(self):
+        self.parser.add_argument(
+            "api",
+            type=str,
+            choices=self.providable_apis,
+            nargs="?",
+            help="API to list providers for. List all if not specified.",
+        )
def _run_providers_list_cmd(self, args: argparse.Namespace) -> None: def _run_providers_list_cmd(self, args: argparse.Namespace) -> None:
@ -37,20 +41,29 @@ class StackListProviders(Subcommand):
from llama_stack.distribution.distribution import Api, get_provider_registry from llama_stack.distribution.distribution import Api, get_provider_registry
all_providers = get_provider_registry() all_providers = get_provider_registry()
-        providers_for_api = all_providers[Api(args.api)]
+        if args.api:
+            providers = [(args.api, all_providers[Api(args.api)])]
+        else:
+            providers = [(k.value, prov) for k, prov in all_providers.items()]
+
+        providers = [p for api, p in providers if api in self.providable_apis]
# eventually, this should query a registry at llama.meta.com/llamastack/distributions # eventually, this should query a registry at llama.meta.com/llamastack/distributions
headers = [ headers = [
"API Type",
"Provider Type", "Provider Type",
"PIP Package Dependencies", "PIP Package Dependencies",
] ]
rows = [] rows = []
-        for spec in providers_for_api.values():
-            if spec.provider_type == "sample":
+        specs = [spec for p in providers for spec in p.values()]
+        for spec in specs:
+            if spec.is_sample:
                continue
rows.append( rows.append(
[ [
spec.api.value,
spec.provider_type, spec.provider_type,
",".join(spec.pip_packages), ",".join(spec.pip_packages),
] ]
@ -59,4 +72,5 @@ class StackListProviders(Subcommand):
rows, rows,
headers, headers,
separate_rows=True, separate_rows=True,
sort_by=(0, 1),
) )

View file

@ -55,6 +55,23 @@ class StackRun(Subcommand):
default=[], default=[],
metavar="KEY=VALUE", metavar="KEY=VALUE",
) )
self.parser.add_argument(
"--tls-keyfile",
type=str,
help="Path to TLS key file for HTTPS",
)
self.parser.add_argument(
"--tls-certfile",
type=str,
help="Path to TLS certificate file for HTTPS",
)
self.parser.add_argument(
"--image-type",
type=str,
help="Image Type used during the build. This can be either conda or container or venv.",
choices=["conda", "container", "venv"],
default="conda",
)
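Put together, a run command using the new flags might look like this (the stack name, port, and certificate paths are placeholders):

llama stack run my-local-stack --port 8443 --image-type venv --tls-keyfile ./certs/key.pem --tls-certfile ./certs/cert.pem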
def _run_stack_run_cmd(self, args: argparse.Namespace) -> None: def _run_stack_run_cmd(self, args: argparse.Namespace) -> None:
import importlib.resources import importlib.resources
@ -82,31 +99,21 @@ class StackRun(Subcommand):
if not config_file.exists() and not has_yaml_suffix: if not config_file.exists() and not has_yaml_suffix:
# check if this is a template # check if this is a template
config_file = ( config_file = Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
)
if config_file.exists(): if config_file.exists():
template_name = args.config template_name = args.config
if not config_file.exists() and not has_yaml_suffix: if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to conda dir # check if it's a build config saved to conda dir
config_file = Path( config_file = Path(BUILDS_BASE_DIR / ImageType.conda.value / f"{args.config}-run.yaml")
BUILDS_BASE_DIR / ImageType.conda.value / f"{args.config}-run.yaml"
)
if not config_file.exists() and not has_yaml_suffix: if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to container dir # check if it's a build config saved to container dir
config_file = Path( config_file = Path(BUILDS_BASE_DIR / ImageType.container.value / f"{args.config}-run.yaml")
BUILDS_BASE_DIR / ImageType.container.value / f"{args.config}-run.yaml"
)
if not config_file.exists() and not has_yaml_suffix: if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to ~/.llama dir # check if it's a build config saved to ~/.llama dir
config_file = Path( config_file = Path(DISTRIBS_BASE_DIR / f"llamastack-{args.config}" / f"{args.config}-run.yaml")
DISTRIBS_BASE_DIR
/ f"llamastack-{args.config}"
/ f"{args.config}-run.yaml"
)
if not config_file.exists(): if not config_file.exists():
self.parser.error( self.parser.error(
@ -118,18 +125,11 @@ class StackRun(Subcommand):
config_dict = yaml.safe_load(config_file.read_text()) config_dict = yaml.safe_load(config_file.read_text())
config = parse_and_maybe_upgrade_config(config_dict) config = parse_and_maybe_upgrade_config(config_dict)
-        if config.container_image:
-            script = (
-                importlib.resources.files("llama_stack")
-                / "distribution/start_container.sh"
-            )
-            image_name = (
-                f"distribution-{template_name}"
-                if template_name
-                else config.container_image
-            )
+        if args.image_type == ImageType.container.value or config.container_image:
+            script = importlib.resources.files("llama_stack") / "distribution/start_container.sh"
+            image_name = f"distribution-{template_name}" if template_name else config.container_image
            run_args = [script, image_name]
-        else:
+        elif args.image_type == ImageType.conda.value:
current_conda_env = os.environ.get("CONDA_DEFAULT_ENV") current_conda_env = os.environ.get("CONDA_DEFAULT_ENV")
image_name = args.image_name or current_conda_env image_name = args.image_name or current_conda_env
if not image_name: if not image_name:
@ -140,12 +140,12 @@ class StackRun(Subcommand):
return return
def get_conda_prefix(env_name): def get_conda_prefix(env_name):
# Conda "base" environment does not end with "base" in the
# prefix, so should be handled separately.
if env_name == "base":
return os.environ.get("CONDA_PREFIX")
# Get conda environments info # Get conda environments info
conda_env_info = json.loads( conda_env_info = json.loads(subprocess.check_output(["conda", "info", "--envs", "--json"]).decode())
subprocess.check_output(
["conda", "info", "--envs", "--json"]
).decode()
)
envs = conda_env_info["envs"] envs = conda_env_info["envs"]
for envpath in envs: for envpath in envs:
if envpath.endswith(env_name): if envpath.endswith(env_name):
@ -169,14 +169,20 @@ class StackRun(Subcommand):
) )
return return
script = ( script = importlib.resources.files("llama_stack") / "distribution/start_conda_env.sh"
importlib.resources.files("llama_stack")
/ "distribution/start_conda_env.sh"
)
run_args = [ run_args = [
script, script,
image_name, image_name,
] ]
else:
# else must be venv since that is the only valid option left.
current_venv = os.environ.get("VIRTUAL_ENV")
venv = args.image_name or current_venv
script = importlib.resources.files("llama_stack") / "distribution/start_venv.sh"
run_args = [
script,
venv,
]
run_args.extend([str(config_file), str(args.port)]) run_args.extend([str(config_file), str(args.port)])
if args.disable_ipv6: if args.disable_ipv6:
@ -198,4 +204,7 @@ class StackRun(Subcommand):
return return
run_args.extend(["--env", f"{key}={value}"]) run_args.extend(["--env", f"{key}={value}"])
if args.tls_keyfile and args.tls_certfile:
run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile])
run_with_pty(run_args) run_with_pty(run_args)

View file

@ -31,6 +31,8 @@ class StackParser(Subcommand):
version=f"{version('llama-stack')}", version=f"{version('llama-stack')}",
) )
self.parser.set_defaults(func=lambda args: self.parser.print_help())
subparsers = self.parser.add_subparsers(title="stack_subcommands") subparsers = self.parser.add_subparsers(title="stack_subcommands")
# Add sub-commands # Add sub-commands

View file

@ -6,6 +6,7 @@
import re import re
import textwrap import textwrap
from typing import Iterable
from termcolor import cprint from termcolor import cprint
@ -22,11 +23,7 @@ def format_row(row, col_widths):
if line.strip() == "": if line.strip() == "":
lines.append("") lines.append("")
else: else:
lines.extend( lines.extend(textwrap.wrap(line, width, break_long_words=False, replace_whitespace=False))
textwrap.wrap(
line, width, break_long_words=False, replace_whitespace=False
)
)
return lines return lines
wrapped = [wrap(item, width) for item, width in zip(row, col_widths)] wrapped = [wrap(item, width) for item, width in zip(row, col_widths)]
@ -43,11 +40,15 @@ def format_row(row, col_widths):
return "\n".join(lines) return "\n".join(lines)
-def print_table(rows, headers=None, separate_rows: bool = False):
+def print_table(rows, headers=None, separate_rows: bool = False, sort_by: Iterable[int] = tuple()):
def itemlen(item): def itemlen(item):
return max([len(line) for line in strip_ansi_colors(item).split("\n")]) return max([len(line) for line in strip_ansi_colors(item).split("\n")])
rows = [[x or "" for x in row] for row in rows] rows = [[x or "" for x in row] for row in rows]
if sort_by:
rows.sort(key=lambda x: tuple(x[i] for i in sort_by))
if not headers: if not headers:
col_widths = [max(itemlen(item) for item in col) for col in zip(*rows)] col_widths = [max(itemlen(item) for item in col) for col in zip(*rows)]
else: else:
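The new keyword is what lets the providers listing above sort by API type and then provider type; a usage sketch matching that call site:

print_table(
    rows,
    headers=["API Type", "Provider Type", "PIP Package Dependencies"],
    separate_rows=True,
    sort_by=(0, 1),
)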

View file

@ -8,6 +8,7 @@ from datetime import datetime
import pytest import pytest
import yaml import yaml
from llama_stack.distribution.configure import ( from llama_stack.distribution.configure import (
LLAMA_STACK_RUN_CONFIG_VERSION, LLAMA_STACK_RUN_CONFIG_VERSION,
parse_and_maybe_upgrade_config, parse_and_maybe_upgrade_config,
@ -41,9 +42,7 @@ def up_to_date_config():
- provider_id: provider1 - provider_id: provider1
provider_type: inline::meta-reference provider_type: inline::meta-reference
config: {{}} config: {{}}
""".format( """.format(version=LLAMA_STACK_RUN_CONFIG_VERSION, built_at=datetime.now().isoformat())
version=LLAMA_STACK_RUN_CONFIG_VERSION, built_at=datetime.now().isoformat()
)
) )
@ -83,9 +82,7 @@ def old_config():
telemetry: telemetry:
provider_type: noop provider_type: noop
config: {{}} config: {{}}
""".format( """.format(built_at=datetime.now().isoformat())
built_at=datetime.now().isoformat()
)
) )
@ -108,10 +105,7 @@ def test_parse_and_maybe_upgrade_config_up_to_date(up_to_date_config):
def test_parse_and_maybe_upgrade_config_old_format(old_config): def test_parse_and_maybe_upgrade_config_old_format(old_config):
result = parse_and_maybe_upgrade_config(old_config) result = parse_and_maybe_upgrade_config(old_config)
assert result.version == LLAMA_STACK_RUN_CONFIG_VERSION assert result.version == LLAMA_STACK_RUN_CONFIG_VERSION
assert all( assert all(api in result.providers for api in ["inference", "safety", "memory", "telemetry"])
api in result.providers
for api in ["inference", "safety", "memory", "telemetry"]
)
safety_provider = result.providers["safety"][0] safety_provider = result.providers["safety"][0]
assert safety_provider.provider_type == "meta-reference" assert safety_provider.provider_type == "meta-reference"
assert "llama_guard_shield" in safety_provider.config assert "llama_guard_shield" in safety_provider.config

Some files were not shown because too many files have changed in this diff.