forked from phoenix-oss/llama-stack-mirror

Compare commits: kvant...eval_api_f (85 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 7854885e5a | |
| | cbb53af701 | |
| | bc0cd07008 | |
| | 7f12ea290f | |
| | 97e7717c9b | |
| | 81bc051411 | |
| | 5038f0e376 | |
| | c2eb47d7e6 | |
| | 2723b05164 | |
| | 64388de068 | |
| | 3f8c7a584a | |
| | 45f6d5cd08 | |
| | a54d757ade | |
| | c1d18283d2 | |
| | 0048274ec0 | |
| | 42447729e4 | |
| | a92756a4b7 | |
| | 08c0c5505e | |
| | bf135f38b1 | |
| | 205a50f10b | |
| | 24d48b3692 | |
| | 913e6eb50f | |
| | 820b9a00c7 | |
| | 85cad639ca | |
| | d994499f09 | |
| | f107e3229b | |
| | 5e817cd56a | |
| | 398319fe7a | |
| | 238cdc4e69 | |
| | b98497ee56 | |
| | e860c536da | |
| | a69759613a | |
| | a8b0467ec3 | |
| | 5c0888c29a | |
| | 46f2ba5910 | |
| | ade3391170 | |
| | 452b2b1284 | |
| | 66cd83fb58 | |
| | 62abe2899a | |
| | cb492eba37 | |
| | 1860751655 | |
| | c80d1f906b | |
| | 035b2dcb60 | |
| | d34b70e3ab | |
| | d9264a0925 | |
| | 63f1525165 | |
| | 5cf7779b8f | |
| | a6fa3aa5a2 | |
| | f2d93324e9 | |
| | 28b8c1c815 | |
| | 6f5df08ebf | |
| | a568bf3f9d | |
| | 2c9d624910 | |
| | 72ccdc19a8 | |
| | 5cb0ad7d7f | |
| | 39f4dfbf50 | |
| | c7d741d89e | |
| | cba4842a87 | |
| | 0e2a13da9c | |
| | 7606e49dbc | |
| | a6095820af | |
| | 89885fd2fa | |
| | 78ec3d98f6 | |
| | 8b80a77fae | |
| | 8a6fa41a93 | |
| | 0df33049e3 | |
| | b4d118fc5c | |
| | 772339bebf | |
| | 4f6f0f6a91 | |
| | 4cc1958af9 | |
| | 09039eca57 | |
| | 790b2d5cc0 | |
| | a3173e8284 | |
| | 18de4cd08a | |
| | 8942071b3b | |
| | f840018088 | |
| | 31e3409909 | |
| | 1d80ec7f81 | |
| | 0abedd070c | |
| | 817331e76e | |
| | 0e47c65051 | |
| | 02aa9a1e85 | |
| | 0e8a53ab69 | |
| | 8592c2b48a | |
| | bc551e6459 | |
932 changed files with 81352 additions and 291202 deletions

@@ -1,6 +0,0 @@
[run]
omit =
    */tests/*
    */llama_stack/providers/*
    */llama_stack/templates/*
    .venv/*

2 .github/CODEOWNERS vendored

@@ -2,4 +2,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
* @ashwinb @yanxi0830 @hardikjshah @raghotham @ehhuang @terrytangyuan @leseb @bbrowning
* @ashwinb @yanxi0830 @hardikjshah @dltn @raghotham @dineshyv @vladimirivic @sixianyi0721 @ehhuang @terrytangyuan @SLR722

10 .github/PULL_REQUEST_TEMPLATE.md vendored

@@ -1,8 +1,10 @@
# What does this PR do?
<!-- Provide a short summary of what this PR does and why. Link to relevant issues if applicable. -->
[Provide a short summary of what this PR does and why. Link to relevant issues if applicable.]

<!-- If resolving an issue, uncomment and update the line below -->
<!-- Closes #[issue-number] -->
[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan
<!-- Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.* -->
[Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*]

[//]: # (## Documentation)

2 .github/TRIAGERS.md vendored

@@ -1,2 +1,2 @@
# This file documents Triage members in the Llama Stack community
@bbrowning @booxter @franciscojavierarceo @leseb
@franciscojavierarceo @leseb

26 .github/actions/setup-ollama/action.yml vendored

@@ -1,26 +0,0 @@
name: Setup Ollama
description: Start Ollama and cache model
inputs:
  models:
    description: Comma-separated list of models to pull
    default: "llama3.2:3b-instruct-fp16,all-minilm:latest"
runs:
  using: "composite"
  steps:
    - name: Install and start Ollama
      shell: bash
      run: |
        # the ollama installer also starts the ollama service
        curl -fsSL https://ollama.com/install.sh | sh

    # Do NOT cache models - pulling the cache is actually slower than just pulling the model.
    # It takes ~45 seconds to pull the models from the cache and unpack it, but only 30 seconds to
    # pull them directly.
    # Maybe this is because the cache is being pulled at the same time by all the matrix jobs?
    - name: Pull requested models
      if: inputs.models != ''
      shell: bash
      run: |
        for model in $(echo "${{ inputs.models }}" | tr ',' ' '); do
          ollama pull "$model"
        done

22 .github/actions/setup-runner/action.yml vendored

@@ -1,22 +0,0 @@
name: Setup runner
description: Prepare a runner for the tests (install uv, python, project dependencies, etc.)
runs:
  using: "composite"
  steps:
    - name: Install uv
      uses: astral-sh/setup-uv@6b9c6063abd6010835644d4c2e1bef4cf5cd0fca # v6.0.1
      with:
        python-version: "3.10"
        activate-environment: true
        version: 0.7.6

    - name: Install dependencies
      shell: bash
      run: |
        uv sync --all-groups
        uv pip install ollama faiss-cpu
        # always test against the latest version of the client
        # TODO: this is not necessarily a good idea. we need to test against both published and latest
        # to find out backwards compatibility issues.
        uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
        uv pip install -e .

1 .github/workflows/Dockerfile vendored

@@ -1 +0,0 @@
FROM localhost:5000/distribution-kvant:dev

@@ -15,13 +15,13 @@ jobs:
      pull-requests: write  # for peter-evans/create-pull-request to create a PR
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - uses: actions/checkout@v4
        with:
          ref: main
          fetch-depth: 0
      - run: |
          python ./scripts/gen-changelog.py
      - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
      - uses: peter-evans/create-pull-request@v7
        with:
          title: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'
          commit-message: 'docs: update CHANGELOG.md for ${{ github.ref_name }}'

73 .github/workflows/ci-playground.yaml vendored

@@ -1,73 +0,0 @@
name: Build and Push playground container
run-name: Build and Push playground container
on:
  workflow_dispatch:
  #schedule:
  #  - cron: "0 10 * * *"
  push:
    branches:
      - main
      - kvant
    tags:
      - 'v*'
  pull_request:
    branches:
      - main
      - kvant
env:
  IMAGE: git.kvant.cloud/${{github.repository}}-playground
jobs:
  build-playground:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set current time
        uses: https://github.com/gerred/actions/current-time@master
        id: current_time

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to git.kvant.cloud registry
        uses: docker/login-action@v3
        with:
          registry: git.kvant.cloud
          username: ${{ vars.ORG_PACKAGE_WRITER_USERNAME }}
          password: ${{ secrets.ORG_PACKAGE_WRITER_TOKEN }}

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          # list of Docker images to use as base name for tags
          images: |
            ${{env.IMAGE}}
          # generate Docker tags based on the following events/attributes
          tags: |
            type=schedule
            type=ref,event=branch
            type=ref,event=pr
            type=ref,event=tag
            type=semver,pattern={{version}}

      - name: Build and push to gitea registry
        uses: docker/build-push-action@v6
        with:
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          context: .
          file: llama_stack/distribution/ui/Containerfile
          provenance: mode=max
          sbom: true
          build-args: |
            BUILD_DATE=${{ steps.current_time.outputs.time }}
          cache-from: |
            type=registry,ref=${{ env.IMAGE }}:buildcache
            type=registry,ref=${{ env.IMAGE }}:${{ github.ref_name }}
            type=registry,ref=${{ env.IMAGE }}:main
          cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache,mode=max,image-manifest=true

98 .github/workflows/ci.yaml vendored

@@ -1,98 +0,0 @@
name: Build and Push container
run-name: Build and Push container
on:
  workflow_dispatch:
  #schedule:
  #  - cron: "0 10 * * *"
  push:
    branches:
      - main
      - kvant
    tags:
      - 'v*'
  pull_request:
    branches:
      - main
      - kvant
env:
  IMAGE: git.kvant.cloud/${{github.repository}}
jobs:
  build:
    runs-on: ubuntu-latest
    services:
      registry:
        image: registry:2
        ports:
          - 5000:5000
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set current time
        uses: https://github.com/gerred/actions/current-time@master
        id: current_time

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
        with:
          driver-opts: network=host

      - name: Login to git.kvant.cloud registry
        uses: docker/login-action@v3
        with:
          registry: git.kvant.cloud
          username: ${{ vars.ORG_PACKAGE_WRITER_USERNAME }}
          password: ${{ secrets.ORG_PACKAGE_WRITER_TOKEN }}

      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          # list of Docker images to use as base name for tags
          images: |
            ${{env.IMAGE}}
          # generate Docker tags based on the following events/attributes
          tags: |
            type=schedule
            type=ref,event=branch
            type=ref,event=pr
            type=ref,event=tag
            type=semver,pattern={{version}}

      - name: Install uv
        uses: https://github.com/astral-sh/setup-uv@v5
        with:
          # Install a specific version of uv.
          version: "0.7.8"

      - name: Build
        env:
          USE_COPY_NOT_MOUNT: true
          LLAMA_STACK_DIR: .
        run: |
          uvx --from . llama stack build --template kvant --image-type container

          # docker tag distribution-kvant:dev ${{env.IMAGE}}:kvant
          # docker push ${{env.IMAGE}}:kvant

          docker tag distribution-kvant:dev localhost:5000/distribution-kvant:dev
          docker push localhost:5000/distribution-kvant:dev

      - name: Build and push to gitea registry
        uses: docker/build-push-action@v6
        with:
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          context: .github/workflows
          provenance: mode=max
          sbom: true
          build-args: |
            BUILD_DATE=${{ steps.current_time.outputs.time }}
          cache-from: |
            type=registry,ref=${{ env.IMAGE }}:buildcache
            type=registry,ref=${{ env.IMAGE }}:${{ github.ref_name }}
            type=registry,ref=${{ env.IMAGE }}:main
          cache-to: type=registry,ref=${{ env.IMAGE }}:buildcache,mode=max,image-manifest=true
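
The deleted kvant workflow above builds the distribution image once, stages it in a throwaway local registry, and then lets docker/build-push-action wrap it via the one-line Dockerfile under .github/workflows. A rough local reproduction, assuming Docker and uv are installed and using only the image names taken from the workflow itself:

```bash
# Throwaway local registry, matching the workflow's registry:2 service on port 5000
docker run -d -p 5000:5000 --name registry registry:2

# Build the kvant distribution container from the current checkout
USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. \
  uvx --from . llama stack build --template kvant --image-type container

# Stage it where the workflow's Dockerfile expects it (FROM localhost:5000/distribution-kvant:dev)
docker tag distribution-kvant:dev localhost:5000/distribution-kvant:dev
docker push localhost:5000/distribution-kvant:dev
```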

@@ -140,7 +140,7 @@ jobs:
      #######################
      - name: "Checkout 'meta-llama/llama-stack' repository"
        id: checkout_repo
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.branch }}

@@ -302,7 +302,7 @@
      - name: "PR - Test Summary"
        id: pr_test_summary_create
        if: github.event_name == 'pull_request_target'
        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
        uses: test-summary/action@v2
        with:
          paths: "${{ github.workspace }}/merged-test-results.xml"
          output: test-summary.md

@@ -310,7 +310,7 @@
      - name: "PR - Upload Test Summary"
        id: pr_test_summary_upload
        if: github.event_name == 'pull_request_target'
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        uses: actions/upload-artifact@v4
        with:
          name: test-summary
          path: test-summary.md

@@ -320,7 +320,7 @@
      - name: "PR - Update comment"
        id: pr_update_comment
        if: github.event_name == 'pull_request_target'
        uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3.0.1
        uses: thollander/actions-comment-pull-request@v3
        with:
          filePath: test-summary.md

@@ -350,6 +350,6 @@
      - name: "Manual - Test Summary"
        id: manual_test_summary
        if: always() && github.event_name == 'workflow_dispatch'
        uses: test-summary/action@31493c76ec9e7aa675f1585d3ed6f1da69269a86 # v2.4
        uses: test-summary/action@v2
        with:
          paths: "${{ github.workspace }}/merged-test-results.xml"

101 .github/workflows/integration-tests.yml vendored Normal file

@@ -0,0 +1,101 @@
name: Integration Tests

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    paths:
      - 'distributions/**'
      - 'llama_stack/**'
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/integration-tests.yml' # This workflow

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  test-matrix:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Listing tests manually since some of them currently fail
        # TODO: generate matrix list from tests/integration when fixed
        test-type: [inference, datasets, inspect, scoring, post_training, providers]
      fail-fast: false # we want to run all tests regardless of failure

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          python-version: "3.10"

      - name: Install Ollama
        run: |
          curl -fsSL https://ollama.com/install.sh | sh

      - name: Pull Ollama image
        run: |
          ollama pull llama3.2:3b-instruct-fp16

      - name: Start Ollama in background
        run: |
          nohup ollama run llama3.2:3b-instruct-fp16 > ollama.log 2>&1 &

      - name: Set Up Environment and Install Dependencies
        run: |
          uv sync --extra dev --extra test
          uv pip install ollama faiss-cpu
          # always test against the latest version of the client
          uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
          uv pip install -e .
          llama stack build --template ollama --image-type venv

      - name: Wait for Ollama to start
        run: |
          echo "Waiting for Ollama..."
          for i in {1..30}; do
            if curl -s http://localhost:11434 | grep -q "Ollama is running"; then
              echo "Ollama is running!"
              exit 0
            fi
            sleep 1
          done
          echo "Ollama failed to start"
          ollama ps
          ollama.log
          exit 1

      - name: Start Llama Stack server in background
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        run: |
          source .venv/bin/activate
          nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv > server.log 2>&1 &

      - name: Wait for Llama Stack server to be ready
        run: |
          echo "Waiting for Llama Stack server..."
          for i in {1..30}; do
            if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
              echo "Llama Stack server is up!"
              exit 0
            fi
            sleep 1
          done
          echo "Llama Stack server failed to start"
          cat server.log
          exit 1

      - name: Run Integration Tests
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        run: |
          uv run pytest -v tests/integration/${{ matrix.test-type }} --stack-config=ollama --text-model="meta-llama/Llama-3.2-3B-Instruct" --embedding-model=all-MiniLM-L6-v2

33 .github/workflows/pre-commit.yml vendored Normal file

@@ -0,0 +1,33 @@
name: Pre-commit

on:
  pull_request:
  push:
    branches: [main]

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  pre-commit:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: pip
          cache-dependency-path: |
            **/requirements*.txt
            .pre-commit-config.yaml

      - uses: pre-commit/action@v3.0.1

      - name: Verify if there are any diff files after pre-commit
        run: |
          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)
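
The same checks can be run locally before pushing; a minimal sketch, assuming uv and the repository's .pre-commit-config.yaml are available in the checkout:

```bash
# Run the full hook suite the workflow runs
uv run pre-commit run --all-files

# Mirror the workflow's final verification: fail if the hooks modified anything
git diff --exit-code
```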

83 .github/workflows/providers-build.yml vendored Normal file

@@ -0,0 +1,83 @@
name: Test Llama Stack Build

on:
  push:
    branches:
      - main
    paths:
      - 'llama_stack/cli/stack/build.py'
      - 'llama_stack/cli/stack/_build.py'
      - 'llama_stack/distribution/build.*'
      - 'llama_stack/distribution/*.sh'
      - '.github/workflows/providers-build.yml'
  pull_request:
    paths:
      - 'llama_stack/cli/stack/build.py'
      - 'llama_stack/cli/stack/_build.py'
      - 'llama_stack/distribution/build.*'
      - 'llama_stack/distribution/*.sh'
      - '.github/workflows/providers-build.yml'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      templates: ${{ steps.set-matrix.outputs.templates }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Generate Template List
        id: set-matrix
        run: |
          templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
          echo "templates=$templates" >> "$GITHUB_OUTPUT"

  build:
    needs: generate-matrix
    runs-on: ubuntu-latest
    strategy:
      matrix:
        template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
        image-type: [venv, container]
      fail-fast: false # We want to run all jobs even if some fail

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          python-version: "3.10"

      - name: Install LlamaStack
        run: |
          uv venv
          source .venv/bin/activate
          uv pip install -e .

      - name: Print build dependencies
        run: |
          uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only

      - name: Run Llama Stack Build
        run: |
          # USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
          # LLAMA_STACK_DIR is set to the current directory so we are building from the source
          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test

      - name: Print dependencies in the image
        if: matrix.image-type == 'venv'
        run: |
          source test/bin/activate
          uv pip list

@@ -20,6 +20,6 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Check PR Title's semantic conformance
        uses: amannn/action-semantic-pull-request@0723387faaf9b38adef4775cd42cfd5155ed6017 # v5.5.3
        uses: amannn/action-semantic-pull-request@v5
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

@@ -22,7 +22,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Stale Action
        uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
        uses: actions/stale@v9
        with:
          stale-issue-label: 'stale'
          stale-issue-message: >

@@ -20,7 +20,7 @@ jobs:
      matrix:
        provider: [fireworks, together]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - uses: actions/checkout@v4
        with:
          ref: ${{ github.event.inputs.commit_sha }}

@@ -6,6 +6,7 @@ on:
  pull_request:
    branches: [ main ]
    paths:
      - 'distributions/**'
      - 'llama_stack/**'
      - 'tests/unit/**'
      - 'uv.lock'

@@ -30,11 +31,17 @@ jobs:
        - "3.12"
        - "3.13"
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - uses: actions/checkout@v4

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Set up Python ${{ matrix.python }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python }}

      - uses: astral-sh/setup-uv@v5
        with:
          python-version: ${{ matrix.python }}
          enable-cache: false

      - name: Run unit tests
        run: |

@@ -42,7 +49,7 @@

      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        uses: actions/upload-artifact@v4
        with:
          name: test-results-${{ matrix.python }}
          path: |
@@ -14,8 +14,6 @@ on:
      - 'docs/**'
      - 'pyproject.toml'
      - '.github/workflows/update-readthedocs.yml'
    tags:
      - '*'
  pull_request:
    branches:
      - main

@@ -35,10 +33,18 @@ jobs:
      TOKEN: ${{ secrets.READTHEDOCS_TOKEN }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        uses: actions/checkout@v4

      - name: Install dependencies
        uses: ./.github/actions/setup-runner
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install the latest version of uv
        uses: astral-sh/setup-uv@v5

      - name: Sync with uv
        run: uv sync --extra docs

      - name: Build HTML
        run: |

@@ -55,10 +61,7 @@

          response=$(curl -X POST \
            -H "Content-Type: application/json" \
            -d "{
              \"token\": \"$TOKEN\",
              \"version\": \"$GITHUB_REF_NAME\"
            }" \
            -d "{\"token\": \"$TOKEN\"}" \
            https://readthedocs.org/api/v2/webhook/llama-stack/289768/)

          echo "Response: $response"

26 .github/workflows_upstream/install-script-ci.yml vendored

@@ -1,26 +0,0 @@
name: Installer CI

on:
  pull_request:
    paths:
      - 'install.sh'
  push:
    paths:
      - 'install.sh'
  schedule:
    - cron: '0 2 * * *' # every day at 02:00 UTC

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
      - name: Run ShellCheck on install.sh
        run: shellcheck install.sh
  smoke-test:
    needs: lint
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2
      - name: Run installer end-to-end
        run: ./install.sh
@@ -1,132 +0,0 @@
name: Integration Auth Tests

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    paths:
      - 'distributions/**'
      - 'llama_stack/**'
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/integration-auth-tests.yml' # This workflow

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  test-matrix:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        auth-provider: [oauth2_token]
      fail-fast: false # we want to run all tests regardless of failure

    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Build Llama Stack
        run: |
          llama stack build --template ollama --image-type venv

      - name: Install minikube
        if: ${{ matrix.auth-provider == 'kubernetes' }}
        uses: medyagh/setup-minikube@cea33675329b799adccc9526aa5daccc26cd5052 # v0.0.19

      - name: Start minikube
        if: ${{ matrix.auth-provider == 'oauth2_token' }}
        run: |
          minikube start
          kubectl get pods -A

      - name: Configure Kube Auth
        if: ${{ matrix.auth-provider == 'oauth2_token' }}
        run: |
          kubectl create namespace llama-stack
          kubectl create serviceaccount llama-stack-auth -n llama-stack
          kubectl create rolebinding llama-stack-auth-rolebinding --clusterrole=admin --serviceaccount=llama-stack:llama-stack-auth -n llama-stack
          kubectl create token llama-stack-auth -n llama-stack > llama-stack-auth-token
          cat <<EOF | kubectl apply -f -
          apiVersion: rbac.authorization.k8s.io/v1
          kind: ClusterRole
          metadata:
            name: allow-anonymous-openid
          rules:
          - nonResourceURLs: ["/openid/v1/jwks"]
            verbs: ["get"]
          ---
          apiVersion: rbac.authorization.k8s.io/v1
          kind: ClusterRoleBinding
          metadata:
            name: allow-anonymous-openid
          roleRef:
            apiGroup: rbac.authorization.k8s.io
            kind: ClusterRole
            name: allow-anonymous-openid
          subjects:
          - kind: User
            name: system:anonymous
            apiGroup: rbac.authorization.k8s.io
          EOF

      - name: Set Kubernetes Config
        if: ${{ matrix.auth-provider == 'oauth2_token' }}
        run: |
          echo "KUBERNETES_API_SERVER_URL=$(kubectl get --raw /.well-known/openid-configuration| jq -r .jwks_uri)" >> $GITHUB_ENV
          echo "KUBERNETES_CA_CERT_PATH=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.certificate-authority}')" >> $GITHUB_ENV
          echo "KUBERNETES_ISSUER=$(kubectl get --raw /.well-known/openid-configuration| jq -r .issuer)" >> $GITHUB_ENV
          echo "KUBERNETES_AUDIENCE=$(kubectl create token llama-stack-auth -n llama-stack --duration=1h | cut -d. -f2 | base64 -d | jq -r '.aud[0]')" >> $GITHUB_ENV

      - name: Set Kube Auth Config and run server
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        if: ${{ matrix.auth-provider == 'oauth2_token' }}
        run: |
          run_dir=$(mktemp -d)
          cat <<'EOF' > $run_dir/run.yaml
          version: '2'
          image_name: kube
          apis: []
          providers: {}
          server:
            port: 8321
          EOF
          yq eval '.server.auth = {"provider_type": "${{ matrix.auth-provider }}"}' -i $run_dir/run.yaml
          yq eval '.server.auth.config = {"tls_cafile": "${{ env.KUBERNETES_CA_CERT_PATH }}", "issuer": "${{ env.KUBERNETES_ISSUER }}", "audience": "${{ env.KUBERNETES_AUDIENCE }}"}' -i $run_dir/run.yaml
          yq eval '.server.auth.config.jwks = {"uri": "${{ env.KUBERNETES_API_SERVER_URL }}"}' -i $run_dir/run.yaml
          cat $run_dir/run.yaml

          nohup uv run llama stack run $run_dir/run.yaml --image-type venv > server.log 2>&1 &

      - name: Wait for Llama Stack server to be ready
        run: |
          echo "Waiting for Llama Stack server..."
          for i in {1..30}; do
            if curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://localhost:8321/v1/health | grep -q "OK"; then
              echo "Llama Stack server is up!"
              if grep -q "Enabling authentication with provider: ${{ matrix.auth-provider }}" server.log; then
                echo "Llama Stack server is configured to use ${{ matrix.auth-provider }} auth"
                exit 0
              else
                echo "Llama Stack server is not configured to use ${{ matrix.auth-provider }} auth"
                cat server.log
                exit 1
              fi
            fi
            sleep 1
          done
          echo "Llama Stack server failed to start"
          cat server.log
          exit 1

      - name: Test auth
        run: |
          curl -s -L -H "Authorization: Bearer $(cat llama-stack-auth-token)" http://127.0.0.1:8321/v1/providers|jq
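
The three yq edits in the "Set Kube Auth Config and run server" step extend the minimal run.yaml heredoc with an oauth2_token auth block before the server starts. A sketch of the resulting file, with placeholder values where the workflow substitutes cluster-specific data:

```bash
# Hypothetical end state of $run_dir/run.yaml (paths and URLs below are placeholders, not real cluster values)
cat <<'EOF' > run.yaml
version: '2'
image_name: kube
apis: []
providers: {}
server:
  port: 8321
  auth:
    provider_type: oauth2_token
    config:
      tls_cafile: /home/runner/.minikube/ca.crt              # from KUBERNETES_CA_CERT_PATH
      issuer: https://kubernetes.default.svc.cluster.local   # from KUBERNETES_ISSUER
      audience: https://kubernetes.default.svc.cluster.local # from KUBERNETES_AUDIENCE
      jwks:
        uri: https://192.168.49.2:8443/openid/v1/jwks        # from KUBERNETES_API_SERVER_URL
EOF
```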

116 .github/workflows_upstream/integration-tests.yml vendored

@@ -1,116 +0,0 @@
name: Integration Tests

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    paths:
      - 'llama_stack/**'
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/integration-tests.yml' # This workflow

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  test-matrix:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Listing tests manually since some of them currently fail
        # TODO: generate matrix list from tests/integration when fixed
        test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime]
        client-type: [library, http]
      fail-fast: false # we want to run all tests regardless of failure

    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Setup ollama
        uses: ./.github/actions/setup-ollama

      - name: Build Llama Stack
        run: |
          llama stack build --template ollama --image-type venv

      - name: Start Llama Stack server in background
        if: matrix.client-type == 'http'
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        run: |
          LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv &

      - name: Wait for Llama Stack server to be ready
        if: matrix.client-type == 'http'
        run: |
          echo "Waiting for Llama Stack server..."
          for i in {1..30}; do
            if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
              echo "Llama Stack server is up!"
              exit 0
            fi
            sleep 1
          done
          echo "Llama Stack server failed to start"
          cat server.log
          exit 1

      - name: Verify Ollama status is OK
        if: matrix.client-type == 'http'
        run: |
          echo "Verifying Ollama status..."
          ollama_status=$(curl -s -L http://127.0.0.1:8321/v1/providers/ollama|jq --raw-output .health.status)
          echo "Ollama status: $ollama_status"
          if [ "$ollama_status" != "OK" ]; then
            echo "Ollama health check failed"
            exit 1
          fi

      - name: Check Storage and Memory Available Before Tests
        if: ${{ always() }}
        run: |
          free -h
          df -h

      - name: Run Integration Tests
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        run: |
          if [ "${{ matrix.client-type }}" == "library" ]; then
            stack_config="ollama"
          else
            stack_config="http://localhost:8321"
          fi
          uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
            -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
            --text-model="meta-llama/Llama-3.2-3B-Instruct" \
            --embedding-model=all-MiniLM-L6-v2

      - name: Check Storage and Memory Available After Tests
        if: ${{ always() }}
        run: |
          free -h
          df -h

      - name: Write ollama logs to file
        if: ${{ always() }}
        run: |
          sudo journalctl -u ollama.service > ollama.log

      - name: Upload all logs to artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}
          path: |
            *.log
          retention-days: 1

45 .github/workflows_upstream/pre-commit.yml vendored

@@ -1,45 +0,0 @@
name: Pre-commit

on:
  pull_request:
  push:
    branches: [main]

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  pre-commit:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Set up Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: '3.11'
          cache: pip
          cache-dependency-path: |
            **/requirements*.txt
            .pre-commit-config.yaml

      - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
        env:
          SKIP: no-commit-to-branch
          RUFF_OUTPUT_FORMAT: github

      - name: Verify if there are any diff files after pre-commit
        run: |
          git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1)

      - name: Verify if there are any new files after pre-commit
        run: |
          unstaged_files=$(git ls-files --others --exclude-standard)
          if [ -n "$unstaged_files" ]; then
            echo "There are uncommitted new files, run pre-commit locally and commit again"
            echo "$unstaged_files"
            exit 1
          fi

147 .github/workflows_upstream/providers-build.yml vendored

@@ -1,147 +0,0 @@
name: Test Llama Stack Build

on:
  push:
    branches:
      - main
    paths:
      - 'llama_stack/cli/stack/build.py'
      - 'llama_stack/cli/stack/_build.py'
      - 'llama_stack/distribution/build.*'
      - 'llama_stack/distribution/*.sh'
      - '.github/workflows/providers-build.yml'
  pull_request:
    paths:
      - 'llama_stack/cli/stack/build.py'
      - 'llama_stack/cli/stack/_build.py'
      - 'llama_stack/distribution/build.*'
      - 'llama_stack/distribution/*.sh'
      - '.github/workflows/providers-build.yml'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      templates: ${{ steps.set-matrix.outputs.templates }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Generate Template List
        id: set-matrix
        run: |
          templates=$(ls llama_stack/templates/*/*build.yaml | awk -F'/' '{print $(NF-1)}' | jq -R -s -c 'split("\n")[:-1]')
          echo "templates=$templates" >> "$GITHUB_OUTPUT"

  build:
    needs: generate-matrix
    runs-on: ubuntu-latest
    strategy:
      matrix:
        template: ${{ fromJson(needs.generate-matrix.outputs.templates) }}
        image-type: [venv, container]
      fail-fast: false # We want to run all jobs even if some fail

    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Print build dependencies
        run: |
          uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test --print-deps-only

      - name: Run Llama Stack Build
        run: |
          # USE_COPY_NOT_MOUNT is set to true since mounting is not supported by docker buildx, we use COPY instead
          # LLAMA_STACK_DIR is set to the current directory so we are building from the source
          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --template ${{ matrix.template }} --image-type ${{ matrix.image-type }} --image-name test

      - name: Print dependencies in the image
        if: matrix.image-type == 'venv'
        run: |
          uv pip list

  build-single-provider:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Build a single provider
        run: |
          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --image-type venv --image-name test --providers inference=remote::ollama

  build-custom-container-distribution:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Build a single provider
        run: |
          yq -i '.image_type = "container"' llama_stack/templates/starter/build.yaml
          yq -i '.image_name = "test"' llama_stack/templates/starter/build.yaml
          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config llama_stack/templates/starter/build.yaml

      - name: Inspect the container image entrypoint
        run: |
          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
          entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
          echo "Entrypoint: $entrypoint"
          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
            echo "Entrypoint is not correct"
            exit 1
          fi

  build-ubi9-container-distribution:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Pin template to UBI9 base
        run: |
          yq -i '
            .image_type = "container" |
            .image_name = "ubi9-test" |
            .distribution_spec.container_image = "registry.access.redhat.com/ubi9:latest"
          ' llama_stack/templates/starter/build.yaml

      - name: Build dev container (UBI9)
        env:
          USE_COPY_NOT_MOUNT: "true"
          LLAMA_STACK_DIR: "."
        run: |
          uv run llama stack build --config llama_stack/templates/starter/build.yaml

      - name: Inspect UBI9 image
        run: |
          IMAGE_ID=$(docker images --format "{{.Repository}}:{{.Tag}}" | head -n 1)
          entrypoint=$(docker inspect --format '{{ .Config.Entrypoint }}' $IMAGE_ID)
          echo "Entrypoint: $entrypoint"
          if [ "$entrypoint" != "[python -m llama_stack.distribution.server.server --config /app/run.yaml]" ]; then
            echo "Entrypoint is not correct"
            exit 1
          fi

          echo "Checking /etc/os-release in $IMAGE_ID"
          docker run --rm --entrypoint sh "$IMAGE_ID" -c \
            'source /etc/os-release && echo "$ID"' \
            | grep -qE '^(rhel|ubi)$' \
            || { echo "Base image is not UBI 9!"; exit 1; }
@@ -1,71 +0,0 @@
name: Test External Providers

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    paths:
      - 'llama_stack/**'
      - 'tests/integration/**'
      - 'uv.lock'
      - 'pyproject.toml'
      - 'requirements.txt'
      - '.github/workflows/test-external-providers.yml' # This workflow

jobs:
  test-external-providers:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        image-type: [venv]
        # We don't do container yet, it's tricky to install a package from the host into the
        # container and point 'uv pip install' to the correct path...
    steps:
      - name: Checkout repository
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Install dependencies
        uses: ./.github/actions/setup-runner

      - name: Apply image type to config file
        run: |
          yq -i '.image_type = "${{ matrix.image-type }}"' tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml
          cat tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml

      - name: Setup directory for Ollama custom provider
        run: |
          mkdir -p tests/external-provider/llama-stack-provider-ollama/src/
          cp -a llama_stack/providers/remote/inference/ollama/ tests/external-provider/llama-stack-provider-ollama/src/llama_stack_provider_ollama

      - name: Create provider configuration
        run: |
          mkdir -p /home/runner/.llama/providers.d/remote/inference
          cp tests/external-provider/llama-stack-provider-ollama/custom_ollama.yaml /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml

      - name: Build distro from config file
        run: |
          USE_COPY_NOT_MOUNT=true LLAMA_STACK_DIR=. uv run llama stack build --config tests/external-provider/llama-stack-provider-ollama/custom-distro.yaml

      - name: Start Llama Stack server in background
        if: ${{ matrix.image-type }} == 'venv'
        env:
          INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct"
        run: |
          uv run pip list
          nohup uv run --active llama stack run tests/external-provider/llama-stack-provider-ollama/run.yaml --image-type ${{ matrix.image-type }} > server.log 2>&1 &

      - name: Wait for Llama Stack server to be ready
        run: |
          for i in {1..30}; do
            if ! grep -q "remote::custom_ollama from /home/runner/.llama/providers.d/remote/inference/custom_ollama.yaml" server.log; then
              echo "Waiting for Llama Stack server to load the provider..."
              sleep 1
            else
              echo "Provider loaded"
              exit 0
            fi
          done
          echo "Provider failed to load"
          cat server.log
          exit 1

2 .gitignore vendored

@@ -6,7 +6,6 @@ dev_requirements.txt
build
.DS_Store
llama_stack/configs/*
.cursor/
xcuserdata/
*.hmap
.DS_Store

@@ -24,4 +23,3 @@ venv/
pytest-report.xml
.coverage
.python-version
data
@@ -15,18 +15,6 @@ repos:
    args: ['--maxkb=1000']
  - id: end-of-file-fixer
    exclude: '^(.*\.svg)$'
  - id: no-commit-to-branch
  - id: check-yaml
    args: ["--unsafe"]
  - id: detect-private-key
  - id: requirements-txt-fixer
  - id: mixed-line-ending
    args: [--fix=lf] # Forces to replace line ending by LF (line feed)
  - id: check-executables-have-shebangs
  - id: check-json
  - id: check-shebang-scripts-are-executable
  - id: check-symlinks
  - id: check-toml

- repo: https://github.com/Lucas-C/pre-commit-hooks
  rev: v1.5.4

@@ -53,7 +41,7 @@ repos:
    - black==24.3.0

- repo: https://github.com/astral-sh/uv-pre-commit
  rev: 0.7.8
  rev: 0.6.3
  hooks:
  - id: uv-lock
  - id: uv-export

@@ -61,7 +49,6 @@ repos:
      "--frozen",
      "--no-hashes",
      "--no-emit-project",
      "--no-default-groups",
      "--output-file=requirements.txt"
    ]

@@ -89,29 +76,24 @@ repos:
  - id: distro-codegen
    name: Distribution Template Codegen
    additional_dependencies:
    - uv==0.7.8
    entry: uv run --group codegen ./scripts/distro_codegen.py
    - uv==0.6.0
    entry: uv run --extra codegen ./scripts/distro_codegen.py
    language: python
    pass_filenames: false
    require_serial: true
    files: ^llama_stack/templates/.*$|^llama_stack/providers/.*/inference/.*/models\.py$

- repo: local
  hooks:
  - id: openapi-codegen
    name: API Spec Codegen
    additional_dependencies:
    - uv==0.7.8
    entry: sh -c 'uv run ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
    - uv==0.6.2
    entry: sh -c 'uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh > /dev/null'
    language: python
    pass_filenames: false
    require_serial: true
    files: ^llama_stack/apis/|^docs/openapi_generator/
  - id: check-workflows-use-hashes
    name: Check GitHub Actions use SHA-pinned actions
    entry: ./scripts/check-workflows-use-hashes.sh
    language: system
    pass_filenames: false
    require_serial: true
    always_run: true
    files: ^\.github/workflows/.*\.ya?ml$

ci:
  autofix_commit_msg: 🎨 [pre-commit.ci] Auto format from pre-commit.com hooks

@@ -5,21 +5,28 @@
# Required
version: 2

# Build documentation in the "docs/" directory with Sphinx
sphinx:
  configuration: docs/source/conf.py

# Set the OS, Python version and other tools you might need
build:
  os: ubuntu-22.04
  tools:
    python: "3.12"
  jobs:
    pre_create_environment:
      - asdf plugin add uv
      - asdf install uv latest
      - asdf global uv latest
    create_environment:
      - uv venv "${READTHEDOCS_VIRTUALENV_PATH}"
    install:
      - UV_PROJECT_ENVIRONMENT="${READTHEDOCS_VIRTUALENV_PATH}" uv sync --frozen --group docs
  # You can also specify other tool versions:
  # nodejs: "19"
  # rust: "1.64"
  # golang: "1.19"

# Build documentation in the "docs/" directory with Sphinx
sphinx:
  configuration: docs/source/conf.py

# Optionally build your docs in additional formats such as PDF and ePub
# formats:
#   - pdf
#   - epub

# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
  install:
    - requirements: docs/requirements.txt

107 CHANGELOG.md

@@ -1,112 +1,5 @@
# Changelog

# v0.2.7
Published on: 2025-05-16T20:38:10Z

## Highlights

This is a small update. But a couple highlights:

* feat: function tools in OpenAI Responses by @bbrowning in https://github.com/meta-llama/llama-stack/pull/2094, getting closer to ready. Streaming is the next missing piece.
* feat: Adding support for customizing chunk context in RAG insertion and querying by @franciscojavierarceo in https://github.com/meta-llama/llama-stack/pull/2134
* feat: scaffolding for Llama Stack UI by @ehhuang in https://github.com/meta-llama/llama-stack/pull/2149, more to come in the coming releases.

---

# v0.2.6
Published on: 2025-05-12T18:06:52Z

---

# v0.2.5
Published on: 2025-05-04T20:16:49Z

---

# v0.2.4
Published on: 2025-04-29T17:26:01Z

## Highlights

* One-liner to install and run Llama Stack yay! by @reluctantfuturist in https://github.com/meta-llama/llama-stack/pull/1383
* support for NVIDIA NeMo datastore by @raspawar in https://github.com/meta-llama/llama-stack/pull/1852
* (yuge!) Kubernetes authentication by @leseb in https://github.com/meta-llama/llama-stack/pull/1778
* (yuge!) OpenAI Responses API by @bbrowning in https://github.com/meta-llama/llama-stack/pull/1989
* add api.llama provider, llama-guard-4 model by @ashwinb in https://github.com/meta-llama/llama-stack/pull/2058

---

# v0.2.3
Published on: 2025-04-25T22:46:21Z

## Highlights

* OpenAI compatible inference endpoints and client-SDK support. `client.chat.completions.create()` now works.
* significant improvements and functionality added to the nVIDIA distribution
* many improvements to the test verification suite.
* new inference providers: Ramalama, IBM WatsonX
* many improvements to the Playground UI

---

# v0.2.2
Published on: 2025-04-13T01:19:49Z

## Main changes

- Bring Your Own Provider (@leseb) - use out-of-tree provider code to execute the distribution server
- OpenAI compatible inference API in progress (@bbrowning)
- Provider verifications (@ehhuang)
- Many updates and fixes to playground
- Several llama4 related fixes

---

# v0.2.1
Published on: 2025-04-05T23:13:00Z

---

# v0.2.0
Published on: 2025-04-05T19:04:29Z

## Llama 4 Support

Checkout more at https://www.llama.com

---

# v0.1.9
Published on: 2025-03-29T00:52:23Z

### Build and Test Agents
* Agents: Entire document context with attachments
* RAG: Documentation with sqlite-vec faiss comparison
* Getting started: Fixes to getting started notebook.

### Agent Evals and Model Customization
* (**New**) Post-training: Add nemo customizer

### Better Engineering
* Moved sqlite-vec to non-blocking calls
* Don't return a payload on file delete

---

# v0.1.8
Published on: 2025-03-24T01:28:50Z
@@ -88,7 +88,7 @@ BRAVE_SEARCH_API_KEY=

And then use this dotenv file when running client SDK tests via the following:
```bash
uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py --text-model=meta-llama/Llama-3.1-8B-Instruct
uv run --env-file .env -- pytest -v tests/integration/inference/test_text_inference.py
```

## Pre-commit Hooks

@@ -110,9 +110,21 @@ uv run pre-commit run --all-files
> [!CAUTION]
> Before pushing your changes, make sure that the pre-commit hooks have passed successfully.

## Running tests
## Running unit tests

You can find the Llama Stack testing documentation here [here](tests/README.md).
You can run the unit tests by running:

```bash
source .venv/bin/activate
./scripts/unit-tests.sh
```

If you'd like to run for a non-default version of Python (currently 3.10), pass `PYTHON_VERSION` variable as follows:

```
source .venv/bin/activate
PYTHON_VERSION=3.13 ./scripts/unit-tests.sh
```

## Adding a new dependency to the project

@@ -125,20 +137,11 @@ uv sync

## Coding Style

* Comments should provide meaningful insights into the code. Avoid filler comments that simply
  describe the next step, as they create unnecessary clutter, same goes for docstrings.
* Prefer comments to clarify surprising behavior and/or relationships between parts of the code
  rather than explain what the next line of code does.
* Catching exceptions, prefer using a specific exception type rather than a broad catch-all like
  `Exception`.
* Comments should provide meaningful insights into the code. Avoid filler comments that simply describe the next step, as they create unnecessary clutter, same goes for docstrings.
* Prefer comments to clarify surprising behavior and/or relationships between parts of the code rather than explain what the next line of code does.
* Catching exceptions, prefer using a specific exception type rather than a broad catch-all like `Exception`.
* Error messages should be prefixed with "Failed to ..."
* 4 spaces for indentation rather than tab
* When using `# noqa` to suppress a style or linter warning, include a comment explaining the
  justification for bypassing the check.
* When using `# type: ignore` to suppress a mypy warning, include a comment explaining the
  justification for bypassing the check.
* Don't use unicode characters in the codebase. ASCII-only is preferred for compatibility or
  readability reasons.
* 4 spaces for indentation rather than tabs

## Common Tasks

@@ -167,11 +170,14 @@ If you have made changes to a provider's configuration in any form (introducing
If you are making changes to the documentation at [https://llama-stack.readthedocs.io/en/latest/](https://llama-stack.readthedocs.io/en/latest/), you can use the following command to build the documentation and preview your changes. You will need [Sphinx](https://www.sphinx-doc.org/en/master/) and the readthedocs theme.

```bash
cd docs
uv sync --extra docs

# This rebuilds the documentation pages.
uv run --group docs make -C docs/ html
uv run make html

# This will start a local server (usually at http://127.0.0.1:8000) that automatically rebuilds and refreshes when you make changes to the documentation.
uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
uv run sphinx-autobuild source build/html --write-all
```

### Update API Documentation

@@ -179,7 +185,7 @@ uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
If you modify or add new API endpoints, update the API documentation accordingly. You can do this by running the following command:

```bash
uv run ./docs/openapi_generator/run_openapi_generator.sh
uv run --with ".[dev]" ./docs/openapi_generator/run_openapi_generator.sh
```

The generated API documentation will be available in `docs/_static/`. Make sure to review the changes before committing.

@@ -1,9 +1,8 @@
include pyproject.toml
include distributions/dependencies.json
include llama_stack/models/llama/llama3/tokenizer.model
include llama_stack/models/llama/llama4/tokenizer.model
include llama_stack/distribution/*.sh
include llama_stack/cli/scripts/*.sh
include llama_stack/templates/*/*.yaml
include llama_stack/providers/tests/test_cases/inference/*.json
include llama_stack/models/llama/*/*.md
include llama_stack/tests/integration/*.jpg
118
README.md
118
README.md
|
@ -3,82 +3,11 @@
|
|||
[](https://pypi.org/project/llama_stack/)
[](https://pypi.org/project/llama-stack/)
[](https://github.com/meta-llama/llama-stack/blob/main/LICENSE)
[](https://discord.gg/llama-stack)
[](https://discord.gg/llama-stack)
[](https://github.com/meta-llama/llama-stack/actions/workflows/unit-tests.yml?query=branch%3Amain)
[](https://github.com/meta-llama/llama-stack/actions/workflows/integration-tests.yml?query=branch%3Amain)

[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb) | [**Discord**](https://discord.gg/llama-stack)

### ✨🎉 Llama 4 Support 🎉✨
We released [Version 0.2.0](https://github.com/meta-llama/llama-stack/releases/tag/v0.2.0) with support for the Llama 4 herd of models released by Meta.

<details>

<summary>👋 Click here to see how to run Llama 4 models on Llama Stack </summary>

\
*Note you need 8xH100 GPU-host to run these models*

```bash
pip install -U llama_stack

MODEL="Llama-4-Scout-17B-16E-Instruct"
# get meta url from llama.com
llama model download --source meta --model-id $MODEL --meta-url <META_URL>

# start a llama stack server
INFERENCE_MODEL=meta-llama/$MODEL llama stack build --run --template meta-reference-gpu

# install client to interact with the server
pip install llama-stack-client
```
### CLI
```bash
# Run a chat completion
llama-stack-client --endpoint http://localhost:8321 \
inference chat-completion \
--model-id meta-llama/$MODEL \
--message "write a haiku for meta's llama 4 models"

ChatCompletionResponse(
    completion_message=CompletionMessage(content="Whispers in code born\nLlama's gentle, wise heartbeat\nFuture's soft unfold", role='assistant', stop_reason='end_of_turn', tool_calls=[]),
    logprobs=None,
    metrics=[Metric(metric='prompt_tokens', value=21.0, unit=None), Metric(metric='completion_tokens', value=28.0, unit=None), Metric(metric='total_tokens', value=49.0, unit=None)]
)
```
### Python SDK
```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url=f"http://localhost:8321")

model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
prompt = "Write a haiku about coding"

print(f"User> {prompt}")
response = client.inference.chat_completion(
    model_id=model_id,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ],
)
print(f"Assistant> {response.completion_message.content}")
```
As more providers start supporting Llama 4, you can use them in Llama Stack as well. We are adding to the list. Stay tuned!


</details>

### 🚀 One-Line Installer 🚀

To try Llama Stack locally, run:

```bash
curl -LsSf https://github.com/meta-llama/llama-stack/raw/main/install.sh | sh
```

### Overview
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb)

Llama Stack standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides

@ -107,29 +36,25 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on
### API Providers
Here is a list of the various API providers and available distributions that can help developers get started easily with Llama Stack.

| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | **Post Training** |
|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|:-----------------:|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | |
| SambaNova | Hosted | | ✅ | | ✅ | | |
| Cerebras | Hosted | | ✅ | | | | |
| Fireworks | Hosted | ✅ | ✅ | ✅ | | | |
| AWS Bedrock | Hosted | | ✅ | | ✅ | | |
| Together | Hosted | ✅ | ✅ | | ✅ | | |
| Groq | Hosted | | ✅ | | | | |
| Ollama | Single Node | | ✅ | | | | |
| TGI | Hosted and Single Node | | ✅ | | | | |
| NVIDIA NIM | Hosted and Single Node | | ✅ | | | | |
| Chroma | Single Node | | | ✅ | | | |
| PG Vector | Single Node | | | ✅ | | | |
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | |
| vLLM | Hosted and Single Node | | ✅ | | | | |
| OpenAI | Hosted | | ✅ | | | | |
| Anthropic | Hosted | | ✅ | | | | |
| Gemini | Hosted | | ✅ | | | | |
| watsonx | Hosted | | ✅ | | | | |
| HuggingFace | Single Node | | | | | | ✅ |
| TorchTune | Single Node | | | | | | ✅ |
| NVIDIA NEMO | Hosted | | | | | | ✅ |

| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:|
| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ |
| SambaNova | Hosted | | ✅ | | | |
| Cerebras | Hosted | | ✅ | | | |
| Fireworks | Hosted | ✅ | ✅ | ✅ | | |
| AWS Bedrock | Hosted | | ✅ | | ✅ | |
| Together | Hosted | ✅ | ✅ | | ✅ | |
| Groq | Hosted | | ✅ | | | |
| Ollama | Single Node | | ✅ | | | |
| TGI | Hosted and Single Node | | ✅ | | | |
| NVIDIA NIM | Hosted and Single Node | | ✅ | | | |
| Chroma | Single Node | | | ✅ | | |
| PG Vector | Single Node | | | ✅ | | |
| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | |
| vLLM | Hosted and Single Node | | ✅ | | | |
| OpenAI | Hosted | | ✅ | | | |
| Anthropic | Hosted | | ✅ | | | |
| Gemini | Hosted | | ✅ | | | |

### Distributions

@ -139,6 +64,7 @@ A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider
| **Distribution** | **Llama Stack Docker** | Start This Distribution |
|:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) |
| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) |
| SambaNova | [llamastack/distribution-sambanova](https://hub.docker.com/repository/docker/llamastack/distribution-sambanova/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/sambanova.html) |
| Cerebras | [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/cerebras.html) |
| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html) |
|
1
distributions/bedrock/build.yaml
Symbolic link
1
distributions/bedrock/build.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/bedrock/build.yaml
|
15
distributions/bedrock/compose.yaml
Normal file
15
distributions/bedrock/compose.yaml
Normal file
|
@ -0,0 +1,15 @@
|
|||
services:
|
||||
llamastack:
|
||||
image: distribution-bedrock
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
- ./run.yaml:/root/llamastack-run-bedrock.yaml
|
||||
ports:
|
||||
- "8321:8321"
|
||||
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 3s
|
||||
max_attempts: 5
|
||||
window: 60s
|
1
distributions/bedrock/run.yaml
Symbolic link
1
distributions/bedrock/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/bedrock/run.yaml
|
1
distributions/cerebras/build.yaml
Symbolic link
1
distributions/cerebras/build.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/cerebras/build.yaml
|
16
distributions/cerebras/compose.yaml
Normal file
16
distributions/cerebras/compose.yaml
Normal file
|
@ -0,0 +1,16 @@
|
|||
services:
|
||||
llamastack:
|
||||
image: llamastack/distribution-cerebras
|
||||
network_mode: "host"
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
- ./run.yaml:/root/llamastack-run-cerebras.yaml
|
||||
ports:
|
||||
- "8321:8321"
|
||||
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml"
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 3s
|
||||
max_attempts: 5
|
||||
window: 60s
|
1
distributions/cerebras/run.yaml
Symbolic link
1
distributions/cerebras/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/cerebras/run.yaml
|
50
distributions/dell-tgi/compose.yaml
Normal file
50
distributions/dell-tgi/compose.yaml
Normal file
|
@ -0,0 +1,50 @@
|
|||
services:
|
||||
text-generation-inference:
|
||||
image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
|
||||
network_mode: "host"
|
||||
volumes:
|
||||
- $HOME/.cache/huggingface:/data
|
||||
ports:
|
||||
- "5009:5009"
|
||||
devices:
|
||||
- nvidia.com/gpu=all
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=0,1,2,3,4
|
||||
- NUM_SHARD=4
|
||||
- MAX_BATCH_PREFILL_TOKENS=32768
|
||||
- MAX_INPUT_TOKENS=8000
|
||||
- MAX_TOTAL_TOKENS=8192
|
||||
command: []
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
# that's the closest analogue to --gpus; provide
|
||||
# an integer amount of devices or 'all'
|
||||
count: all
|
||||
# Devices are reserved using a list of capabilities, making
|
||||
# capabilities the only required field. A device MUST
|
||||
# satisfy all the requested capabilities for a successful
|
||||
# reservation.
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
llamastack:
|
||||
depends_on:
|
||||
text-generation-inference:
|
||||
condition: service_healthy
|
||||
image: llamastack/distribution-tgi
|
||||
network_mode: "host"
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
# Link to TGI run.yaml file
|
||||
- ./run.yaml:/root/my-run.yaml
|
||||
ports:
|
||||
- "8321:8321"
|
||||
# Hack: wait for TGI server to start before starting docker
|
||||
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 3s
|
||||
max_attempts: 5
|
||||
window: 60s
|
44
distributions/dell-tgi/run.yaml
Normal file
44
distributions/dell-tgi/run.yaml
Normal file
|
@ -0,0 +1,44 @@
|
|||
version: '2'
|
||||
image_name: local
|
||||
container_image: null
|
||||
conda_env: local
|
||||
apis:
|
||||
- shields
|
||||
- agents
|
||||
- models
|
||||
- memory
|
||||
- memory_banks
|
||||
- inference
|
||||
- safety
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: tgi0
|
||||
provider_type: remote::tgi
|
||||
config:
|
||||
url: http://127.0.0.1:80
|
||||
safety:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
model: Llama-Guard-3-1B
|
||||
excluded_categories: []
|
||||
- provider_id: meta1
|
||||
provider_type: inline::prompt-guard
|
||||
config:
|
||||
model: Prompt-Guard-86M
|
||||
memory:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::faiss
|
||||
config: {}
|
||||
agents:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ~/.llama/runtime/kvstore.db
|
||||
telemetry:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
|
@ -1,23 +1,19 @@
|
|||
{
|
||||
"bedrock": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"boto3",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
|
@ -25,36 +21,29 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn"
|
||||
],
|
||||
"cerebras": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"cerebras_cloud_sdk",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
|
@ -62,38 +51,31 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
],
|
||||
"ci-tests": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"fireworks-ai",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
|
@ -101,17 +83,14 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"sqlite-vec",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
|
@ -119,22 +98,18 @@
|
|||
"dell": [
|
||||
"aiohttp",
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"huggingface_hub",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
|
@ -142,39 +117,66 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
],
|
||||
"dev": [
|
||||
"aiosqlite",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"fireworks-ai",
|
||||
"httpx",
|
||||
"litellm",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
"pillow",
|
||||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlite-vec",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
],
|
||||
"fireworks": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"fireworks-ai",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
|
@ -182,37 +184,30 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
],
|
||||
"groq": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"litellm",
|
||||
"matplotlib",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
|
@ -220,38 +215,31 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn"
|
||||
],
|
||||
"hf-endpoint": [
|
||||
"aiohttp",
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"huggingface_hub",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
|
@ -259,38 +247,31 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn"
|
||||
],
|
||||
"hf-serverless": [
|
||||
"aiohttp",
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"huggingface_hub",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
|
@ -298,95 +279,13 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
],
|
||||
"kvant": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
"pillow",
|
||||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
],
|
||||
"llama_api": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"litellm",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
"pillow",
|
||||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"sqlite-vec",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
|
@ -394,25 +293,20 @@
|
|||
"meta-reference-gpu": [
|
||||
"accelerate",
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"fairscale",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fbgemm-gpu-genai==1.1.2",
|
||||
"fire",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"lm-format-enforcer",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
|
@ -420,20 +314,55 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentence-transformers",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"torch",
|
||||
"torchao==0.8.0",
|
||||
"torchvision",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"zmq"
|
||||
],
|
||||
"meta-reference-quantized-gpu": [
|
||||
"accelerate",
|
||||
"aiosqlite",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"fairscale",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fbgemm-gpu",
|
||||
"fire",
|
||||
"httpx",
|
||||
"lm-format-enforcer",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
"pillow",
|
||||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentence-transformers",
|
||||
"sentencepiece",
|
||||
"torch",
|
||||
"torchao==0.5.0",
|
||||
"torchvision",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"uvicorn",
|
||||
"zmq"
|
||||
],
|
||||
|
@ -442,7 +371,6 @@
|
|||
"aiosqlite",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"datasets",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
|
@ -463,7 +391,6 @@
|
|||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"uvicorn"
|
||||
|
@ -471,63 +398,19 @@
|
|||
"ollama": [
|
||||
"aiohttp",
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"ollama",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
"peft",
|
||||
"pillow",
|
||||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"torch",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"trl",
|
||||
"uvicorn"
|
||||
],
|
||||
"open-benchmark": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"litellm",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
|
@ -535,38 +418,29 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"sqlite-vec",
|
||||
"together",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn"
|
||||
],
|
||||
"passthrough": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
|
@ -574,33 +448,27 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
],
|
||||
"remote-vllm": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
|
@ -613,16 +481,13 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
|
@ -636,46 +501,7 @@
|
|||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"litellm",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
"pillow",
|
||||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
],
|
||||
"starter": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"fireworks-ai",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"litellm",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
|
@ -686,41 +512,31 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"sqlite-vec",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
"uvicorn"
|
||||
],
|
||||
"tgi": [
|
||||
"aiohttp",
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"huggingface_hub",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
|
@ -728,38 +544,31 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
],
|
||||
"together": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
|
@ -767,79 +576,32 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"together",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
],
|
||||
"verification": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"litellm",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
"pillow",
|
||||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"sqlite-vec",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
],
|
||||
"vllm-gpu": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"chromadb-client",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
|
@ -847,58 +609,16 @@
|
|||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"vllm",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
],
|
||||
"watsonx": [
|
||||
"aiosqlite",
|
||||
"autoevals",
|
||||
"blobfile",
|
||||
"chardet",
|
||||
"datasets",
|
||||
"emoji",
|
||||
"faiss-cpu",
|
||||
"fastapi",
|
||||
"fire",
|
||||
"httpx",
|
||||
"ibm_watson_machine_learning",
|
||||
"langdetect",
|
||||
"matplotlib",
|
||||
"mcp",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai",
|
||||
"opentelemetry-exporter-otlp-proto-http",
|
||||
"opentelemetry-sdk",
|
||||
"pandas",
|
||||
"pillow",
|
||||
"psycopg2-binary",
|
||||
"pymongo",
|
||||
"pypdf",
|
||||
"pythainlp",
|
||||
"redis",
|
||||
"requests",
|
||||
"scikit-learn",
|
||||
"scipy",
|
||||
"sentencepiece",
|
||||
"sqlalchemy[asyncio]",
|
||||
"tqdm",
|
||||
"transformers",
|
||||
"tree_sitter",
|
||||
"uvicorn",
|
||||
"sentence-transformers --no-deps",
|
||||
"torch torchvision --index-url https://download.pytorch.org/whl/cpu"
|
||||
]
|
||||
}
|
1
distributions/fireworks/build.yaml
Symbolic link
1
distributions/fireworks/build.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/fireworks/build.yaml
|
14
distributions/fireworks/compose.yaml
Normal file
14
distributions/fireworks/compose.yaml
Normal file
|
@ -0,0 +1,14 @@
|
|||
services:
|
||||
llamastack:
|
||||
image: llamastack/distribution-fireworks
|
||||
ports:
|
||||
- "8321:8321"
|
||||
environment:
|
||||
- FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
|
||||
entrypoint: bash -c "python -m llama_stack.distribution.server.server --template fireworks"
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 3s
|
||||
max_attempts: 5
|
||||
window: 60s
|
1
distributions/fireworks/run.yaml
Symbolic link
1
distributions/fireworks/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/fireworks/run.yaml
|
1
distributions/meta-reference-gpu/build.yaml
Symbolic link
1
distributions/meta-reference-gpu/build.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/meta-reference-gpu/build.yaml
|
34
distributions/meta-reference-gpu/compose.yaml
Normal file
34
distributions/meta-reference-gpu/compose.yaml
Normal file
|
@ -0,0 +1,34 @@
|
|||
services:
|
||||
llamastack:
|
||||
image: llamastack/distribution-meta-reference-gpu
|
||||
network_mode: "host"
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
- ./run.yaml:/root/my-run.yaml
|
||||
ports:
|
||||
- "8321:8321"
|
||||
devices:
|
||||
- nvidia.com/gpu=all
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=0
|
||||
command: []
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
# that's the closest analogue to --gpus; provide
|
||||
# an integer amount of devices or 'all'
|
||||
count: 1
|
||||
# Devices are reserved using a list of capabilities, making
|
||||
# capabilities the only required field. A device MUST
|
||||
# satisfy all the requested capabilities for a successful
|
||||
# reservation.
|
||||
capabilities: [gpu]
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 3s
|
||||
max_attempts: 5
|
||||
window: 60s
|
||||
runtime: nvidia
|
||||
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
|
1
distributions/meta-reference-gpu/run-with-safety.yaml
Symbolic link
1
distributions/meta-reference-gpu/run-with-safety.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
|
1
distributions/meta-reference-gpu/run.yaml
Symbolic link
1
distributions/meta-reference-gpu/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/meta-reference-gpu/run.yaml
|
1
distributions/meta-reference-quantized-gpu/build.yaml
Symbolic link
1
distributions/meta-reference-quantized-gpu/build.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml
|
35
distributions/meta-reference-quantized-gpu/compose.yaml
Normal file
35
distributions/meta-reference-quantized-gpu/compose.yaml
Normal file
|
@ -0,0 +1,35 @@
|
|||
services:
|
||||
llamastack:
|
||||
image: llamastack/distribution-meta-reference-quantized-gpu
|
||||
network_mode: "host"
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
- ./run.yaml:/root/my-run.yaml
|
||||
ports:
|
||||
- "8321:8321"
|
||||
devices:
|
||||
- nvidia.com/gpu=all
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=0
|
||||
command: []
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
# that's the closest analogue to --gpus; provide
|
||||
# an integer amount of devices or 'all'
|
||||
count: 1
|
||||
# Devices are reserved using a list of capabilities, making
|
||||
# capabilities the only required field. A device MUST
|
||||
# satisfy all the requested capabilities for a successful
|
||||
# reservation.
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 3s
|
||||
max_attempts: 5
|
||||
window: 60s
|
58
distributions/meta-reference-quantized-gpu/run.yaml
Normal file
58
distributions/meta-reference-quantized-gpu/run.yaml
Normal file
|
@ -0,0 +1,58 @@
|
|||
version: '2'
|
||||
image_name: local
|
||||
container_image: null
|
||||
conda_env: local
|
||||
apis:
|
||||
- shields
|
||||
- agents
|
||||
- models
|
||||
- memory
|
||||
- memory_banks
|
||||
- inference
|
||||
- safety
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference-quantized
|
||||
config:
|
||||
model: Llama3.2-3B-Instruct:int4-qlora-eo8
|
||||
quantization:
|
||||
type: int4
|
||||
torch_seed: null
|
||||
max_seq_len: 2048
|
||||
max_batch_size: 1
|
||||
- provider_id: meta1
|
||||
provider_type: inline::meta-reference-quantized
|
||||
config:
|
||||
# not a quantized model !
|
||||
model: Llama-Guard-3-1B
|
||||
quantization: null
|
||||
torch_seed: null
|
||||
max_seq_len: 2048
|
||||
max_batch_size: 1
|
||||
safety:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
model: Llama-Guard-3-1B
|
||||
excluded_categories: []
|
||||
- provider_id: meta1
|
||||
provider_type: inline::prompt-guard
|
||||
config:
|
||||
model: Prompt-Guard-86M
|
||||
memory:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
||||
agents:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ~/.llama/runtime/kvstore.db
|
||||
telemetry:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
1
distributions/ollama/build.yaml
Symbolic link
1
distributions/ollama/build.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/ollama/build.yaml
|
71
distributions/ollama/compose.yaml
Normal file
71
distributions/ollama/compose.yaml
Normal file
|
@ -0,0 +1,71 @@
|
|||
services:
|
||||
ollama:
|
||||
image: ollama/ollama:latest
|
||||
network_mode: ${NETWORK_MODE:-bridge}
|
||||
volumes:
|
||||
- ~/.ollama:/root/.ollama
|
||||
ports:
|
||||
- "11434:11434"
|
||||
environment:
|
||||
OLLAMA_DEBUG: 1
|
||||
command: []
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 8G # Set maximum memory
|
||||
reservations:
|
||||
memory: 8G # Set minimum memory reservation
|
||||
# healthcheck:
|
||||
# # ugh, no CURL in ollama image
|
||||
# test: ["CMD", "curl", "-f", "http://ollama:11434"]
|
||||
# interval: 10s
|
||||
# timeout: 5s
|
||||
# retries: 5
|
||||
|
||||
ollama-init:
|
||||
image: ollama/ollama:latest
|
||||
depends_on:
|
||||
- ollama
|
||||
# condition: service_healthy
|
||||
network_mode: ${NETWORK_MODE:-bridge}
|
||||
environment:
|
||||
- OLLAMA_HOST=ollama
|
||||
- INFERENCE_MODEL=${INFERENCE_MODEL}
|
||||
- SAFETY_MODEL=${SAFETY_MODEL:-}
|
||||
volumes:
|
||||
- ~/.ollama:/root/.ollama
|
||||
- ./pull-models.sh:/pull-models.sh
|
||||
entrypoint: ["/pull-models.sh"]
|
||||
|
||||
llamastack:
|
||||
depends_on:
|
||||
ollama:
|
||||
condition: service_started
|
||||
ollama-init:
|
||||
condition: service_started
|
||||
image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
|
||||
network_mode: ${NETWORK_MODE:-bridge}
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
# Link to ollama run.yaml file
|
||||
- ~/local/llama-stack/:/app/llama-stack-source
|
||||
- ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
|
||||
ports:
|
||||
- "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
|
||||
environment:
|
||||
- INFERENCE_MODEL=${INFERENCE_MODEL}
|
||||
- SAFETY_MODEL=${SAFETY_MODEL:-}
|
||||
- OLLAMA_URL=http://ollama:11434
|
||||
entrypoint: >
|
||||
python -m llama_stack.distribution.server.server /root/my-run.yaml \
|
||||
--port ${LLAMA_STACK_PORT:-8321}
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 10s
|
||||
max_attempts: 3
|
||||
window: 60s
|
||||
volumes:
|
||||
ollama:
|
||||
ollama-init:
|
||||
llamastack:
|
18
distributions/ollama/pull-models.sh
Executable file
18
distributions/ollama/pull-models.sh
Executable file
|
@ -0,0 +1,18 @@
|
|||
#!/bin/sh
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
|
||||
for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
|
||||
echo "Preloading $model..."
|
||||
if ! ollama run "$model"; then
|
||||
echo "Failed to pull and run $model"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
echo "All models pulled successfully"
|
1
distributions/ollama/run-with-safety.yaml
Symbolic link
1
distributions/ollama/run-with-safety.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/ollama/run-with-safety.yaml
|
1
distributions/ollama/run.yaml
Symbolic link
1
distributions/ollama/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/ollama/run.yaml
|
BIN
distributions/ramalama/faiss_store.db
Normal file
BIN
distributions/ramalama/faiss_store.db
Normal file
Binary file not shown.
1
distributions/remote-nvidia/build.yaml
Symbolic link
1
distributions/remote-nvidia/build.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/nvidia/build.yaml
|
19
distributions/remote-nvidia/compose.yaml
Normal file
19
distributions/remote-nvidia/compose.yaml
Normal file
|
@ -0,0 +1,19 @@
|
|||
services:
|
||||
llamastack:
|
||||
image: distribution-nvidia:dev
|
||||
network_mode: "host"
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
- ./run.yaml:/root/llamastack-run-nvidia.yaml
|
||||
ports:
|
||||
- "8321:8321"
|
||||
environment:
|
||||
- INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
|
||||
- NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
|
||||
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 3s
|
||||
max_attempts: 5
|
||||
window: 60s
|
1
distributions/remote-nvidia/run.yaml
Symbolic link
1
distributions/remote-nvidia/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/nvidia/run.yaml
|
1
distributions/remote-vllm/build.yaml
Symbolic link
1
distributions/remote-vllm/build.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/remote-vllm/build.yaml
|
99
distributions/remote-vllm/compose.yaml
Normal file
99
distributions/remote-vllm/compose.yaml
Normal file
|
@ -0,0 +1,99 @@
|
|||
services:
|
||||
vllm-inference:
|
||||
image: vllm/vllm-openai:latest
|
||||
volumes:
|
||||
- $HOME/.cache/huggingface:/root/.cache/huggingface
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
ports:
|
||||
- "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
|
||||
devices:
|
||||
- nvidia.com/gpu=all
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
|
||||
- HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
|
||||
command: >
|
||||
--gpu-memory-utilization 0.75
|
||||
--model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
|
||||
--enforce-eager
|
||||
--max-model-len 8192
|
||||
--max-num-seqs 16
|
||||
--port ${VLLM_INFERENCE_PORT:-5100}
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
|
||||
# A little trick:
|
||||
# if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
|
||||
# otherwise, the entry will end in a hyphen which gets ignored by docker compose
|
||||
vllm-${VLLM_SAFETY_MODEL:+safety}:
|
||||
image: vllm/vllm-openai:latest
|
||||
volumes:
|
||||
- $HOME/.cache/huggingface:/root/.cache/huggingface
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
ports:
|
||||
- "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
|
||||
devices:
|
||||
- nvidia.com/gpu=all
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
|
||||
- HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
|
||||
command: >
|
||||
--gpu-memory-utilization 0.75
|
||||
--model ${VLLM_SAFETY_MODEL}
|
||||
--enforce-eager
|
||||
--max-model-len 8192
|
||||
--max-num-seqs 16
|
||||
--port ${VLLM_SAFETY_PORT:-5101}
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
llamastack:
|
||||
depends_on:
|
||||
- vllm-inference:
|
||||
condition: service_healthy
|
||||
- vllm-${VLLM_SAFETY_MODEL:+safety}:
|
||||
condition: service_healthy
|
||||
image: llamastack/distribution-remote-vllm:test-0.0.52rc3
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
- ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
environment:
|
||||
- VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
|
||||
- VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
|
||||
- INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
|
||||
- MAX_TOKENS=${MAX_TOKENS:-4096}
|
||||
- SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
|
||||
- SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
|
||||
ports:
|
||||
- "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
|
||||
# Hack: wait for vLLM server to start before starting docker
|
||||
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 8321"
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 3s
|
||||
max_attempts: 5
|
||||
window: 60s
|
||||
volumes:
|
||||
vllm-inference:
|
||||
vllm-safety:
|
||||
llamastack:
|
1
distributions/remote-vllm/run-with-safety.yaml
Symbolic link
1
distributions/remote-vllm/run-with-safety.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/remote-vllm/run-with-safety.yaml
|
1
distributions/remote-vllm/run.yaml
Symbolic link
1
distributions/remote-vllm/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/remote-vllm/run.yaml
|
9
distributions/runpod/build.yaml
Normal file
9
distributions/runpod/build.yaml
Normal file
|
@ -0,0 +1,9 @@
|
|||
name: runpod
|
||||
distribution_spec:
|
||||
description: Use Runpod for running LLM inference
|
||||
providers:
|
||||
inference: remote::runpod
|
||||
memory: meta-reference
|
||||
safety: meta-reference
|
||||
agents: meta-reference
|
||||
telemetry: meta-reference
|
1
distributions/sambanova/build.yaml
Normal file
1
distributions/sambanova/build.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/sambanova/build.yaml
|
16
distributions/sambanova/compose.yaml
Normal file
16
distributions/sambanova/compose.yaml
Normal file
|
@ -0,0 +1,16 @@
|
|||
services:
|
||||
llamastack:
|
||||
image: llamastack/distribution-sambanova
|
||||
network_mode: "host"
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
- ./run.yaml:/root/llamastack-run-sambanova.yaml
|
||||
ports:
|
||||
- "5000:5000"
|
||||
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-sambanova.yaml"
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 3s
|
||||
max_attempts: 5
|
||||
window: 60s
|
1
distributions/sambanova/run.yaml
Normal file
1
distributions/sambanova/run.yaml
Normal file
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/sambanova/run.yaml
|
1
distributions/tgi/build.yaml
Symbolic link
1
distributions/tgi/build.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/tgi/build.yaml
|
103
distributions/tgi/compose.yaml
Normal file
103
distributions/tgi/compose.yaml
Normal file
|
@ -0,0 +1,103 @@
|
|||
services:
|
||||
tgi-inference:
|
||||
image: ghcr.io/huggingface/text-generation-inference:latest
|
||||
volumes:
|
||||
- $HOME/.cache/huggingface:/data
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
ports:
|
||||
- "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
|
||||
devices:
|
||||
- nvidia.com/gpu=all
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
|
||||
- HF_TOKEN=$HF_TOKEN
|
||||
- HF_HOME=/data
|
||||
- HF_DATASETS_CACHE=/data
|
||||
- HF_MODULES_CACHE=/data
|
||||
- HF_HUB_CACHE=/data
|
||||
command: >
|
||||
--dtype bfloat16
|
||||
--usage-stats off
|
||||
--sharded false
|
||||
--model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
|
||||
--port ${TGI_INFERENCE_PORT:-8080}
|
||||
--cuda-memory-fraction 0.75
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 30
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
|
||||
tgi-${TGI_SAFETY_MODEL:+safety}:
|
||||
image: ghcr.io/huggingface/text-generation-inference:latest
|
||||
volumes:
|
||||
- $HOME/.cache/huggingface:/data
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
ports:
|
||||
- "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
|
||||
devices:
|
||||
- nvidia.com/gpu=all
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
|
||||
- HF_TOKEN=$HF_TOKEN
|
||||
- HF_HOME=/data
|
||||
- HF_DATASETS_CACHE=/data
|
||||
- HF_MODULES_CACHE=/data
|
||||
- HF_HUB_CACHE=/data
|
||||
command: >
|
||||
--dtype bfloat16
|
||||
--usage-stats off
|
||||
--sharded false
|
||||
--model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
|
||||
--port ${TGI_SAFETY_PORT:-8081}
|
||||
--cuda-memory-fraction 0.75
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 30
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
|
||||
llamastack:
|
||||
depends_on:
|
||||
tgi-inference:
|
||||
condition: service_healthy
|
||||
tgi-${TGI_SAFETY_MODEL:+safety}:
|
||||
condition: service_healthy
|
||||
image: llamastack/distribution-tgi:test-0.0.52rc3
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
- ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
|
||||
ports:
|
||||
- "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
|
||||
# Hack: wait for TGI server to start before starting docker
|
||||
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 3s
|
||||
max_attempts: 5
|
||||
window: 60s
|
||||
environment:
|
||||
- TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
|
||||
- SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
|
||||
- INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
|
||||
- SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
|
||||
|
||||
volumes:
|
||||
tgi-inference:
|
||||
tgi-safety:
|
||||
llamastack:
|
1
distributions/tgi/run-with-safety.yaml
Symbolic link
1
distributions/tgi/run-with-safety.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/tgi/run-with-safety.yaml
|
1
distributions/tgi/run.yaml
Symbolic link
1
distributions/tgi/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/tgi/run.yaml
|
1
distributions/together/build.yaml
Symbolic link
1
distributions/together/build.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/together/build.yaml
|
14
distributions/together/compose.yaml
Normal file
14
distributions/together/compose.yaml
Normal file
|
@ -0,0 +1,14 @@
|
|||
services:
|
||||
llamastack:
|
||||
image: llamastack/distribution-together
|
||||
ports:
|
||||
- "8321:8321"
|
||||
environment:
|
||||
- TOGETHER_API_KEY=${TOGETHER_API_KEY}
|
||||
entrypoint: bash -c "python -m llama_stack.distribution.server.server --template together"
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 3s
|
||||
max_attempts: 5
|
||||
window: 60s
|
1
distributions/together/run.yaml
Symbolic link
1
distributions/together/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/together/run.yaml
|
1
distributions/vllm-gpu/build.yaml
Symbolic link
1
distributions/vllm-gpu/build.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/inline-vllm/build.yaml
|
35
distributions/vllm-gpu/compose.yaml
Normal file
35
distributions/vllm-gpu/compose.yaml
Normal file
|
@ -0,0 +1,35 @@
|
|||
services:
|
||||
llamastack:
|
||||
image: llamastack/distribution-inline-vllm
|
||||
network_mode: "host"
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
- ./run.yaml:/root/my-run.yaml
|
||||
ports:
|
||||
- "8321:8321"
|
||||
devices:
|
||||
- nvidia.com/gpu=all
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=0
|
||||
command: []
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
# that's the closest analogue to --gpus; provide
|
||||
# an integer amount of devices or 'all'
|
||||
count: 1
|
||||
# Devices are reserved using a list of capabilities, making
|
||||
# capabilities the only required field. A device MUST
|
||||
# satisfy all the requested capabilities for a successful
|
||||
# reservation.
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 3s
|
||||
max_attempts: 5
|
||||
window: 60s
|
distributions/vllm-gpu/run.yaml (new file, 66 lines)
@@ -0,0 +1,66 @@
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: vllm-inference
    provider_type: inline::vllm
    config:
      model: Llama3.2-3B-Instruct
      tensor_parallel_size: 1
      gpu_memory_utilization: 0.4
      enforce_eager: true
      max_tokens: 4096
  - provider_id: vllm-inference-safety
    provider_type: inline::vllm
    config:
      model: Llama-Guard-3-1B
      tensor_parallel_size: 1
      gpu_memory_utilization: 0.2
      enforce_eager: true
      max_tokens: 4096
  safety:
  - provider_id: meta0
    provider_type: inline::llama-guard
    config:
      model: Llama-Guard-3-1B
      excluded_categories: []
  # Uncomment to use prompt guard
  # - provider_id: meta1
  #   provider_type: inline::prompt-guard
  #   config:
  #     model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
  # Uncomment to use pgvector
  # - provider_id: pgvector
  #   provider_type: remote::pgvector
  #   config:
  #     host: 127.0.0.1
  #     port: 5432
  #     db: postgres
  #     user: postgres
  #     password: mysecretpassword
  agents:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/agents_store.db
  telemetry:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
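With this run.yaml, the inline vLLM providers serve Llama3.2-3B-Instruct for inference and Llama-Guard-3-1B for safety. The sketch below is illustrative only: it assumes the server is up on localhost:8321 and that the inference model is registered under the name shown in the config above.

```python
# Hedged sketch: a chat completion against the inline vLLM provider configured above.
# The base URL and the exact model identifier are assumptions about your deployment.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

response = client.inference.chat_completion(
    model_id="Llama3.2-3B-Instruct",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.completion_message.content)
```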
docs/_static/css/my_theme.css (vendored, 17 lines changed)
@@ -16,20 +16,3 @@
.hide-title h1 {
    display: none;
}

h2, h3, h4 {
    font-weight: normal;
}
html[data-theme="dark"] .rst-content div[class^="highlight"] {
    background-color: #0b0b0b;
}
pre {
    white-space: pre-wrap !important;
    word-break: break-all;
}

[data-theme="dark"] .mermaid {
    background-color: #f4f4f6 !important;
    border-radius: 6px;
    padding: 0.5em;
}
docs/_static/js/detect_theme.js (vendored, whole file of 32 lines)
@@ -1,32 +0,0 @@
document.addEventListener("DOMContentLoaded", function () {
  const prefersDark = window.matchMedia("(prefers-color-scheme: dark)").matches;
  const htmlElement = document.documentElement;

  // Check if theme is saved in localStorage
  const savedTheme = localStorage.getItem("sphinx-rtd-theme");

  if (savedTheme) {
    // Use the saved theme preference
    htmlElement.setAttribute("data-theme", savedTheme);
    document.body.classList.toggle("dark", savedTheme === "dark");
  } else {
    // Fall back to system preference
    const theme = prefersDark ? "dark" : "light";
    htmlElement.setAttribute("data-theme", theme);
    document.body.classList.toggle("dark", theme === "dark");
    // Save initial preference
    localStorage.setItem("sphinx-rtd-theme", theme);
  }

  // Listen for theme changes from the existing toggle
  const observer = new MutationObserver(function (mutations) {
    mutations.forEach(function (mutation) {
      if (mutation.attributeName === "data-theme") {
        const currentTheme = htmlElement.getAttribute("data-theme");
        localStorage.setItem("sphinx-rtd-theme", currentTheme);
      }
    });
  });

  observer.observe(htmlElement, { attributes: true });
});
docs/_static/llama-stack-spec.html (vendored, 6486 lines changed): file diff suppressed because it is too large
docs/_static/llama-stack-spec.yaml (vendored, 4765 lines changed): file diff suppressed because it is too large
Binary file not shown (before: 33 KiB)
Binary file not shown (before: 37 KiB)
Binary file not shown (before: 56 KiB)
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
@@ -840,6 +840,7 @@
"    \"memory_optimizations.rst\",\n",
"    \"chat.rst\",\n",
"    \"llama3.rst\",\n",
"    \"datasets.rst\",\n",
"    \"qat_finetune.rst\",\n",
"    \"lora_finetune.rst\",\n",
"]\n",
@@ -1585,6 +1586,7 @@
"    \"memory_optimizations.rst\",\n",
"    \"chat.rst\",\n",
"    \"llama3.rst\",\n",
"    \"datasets.rst\",\n",
"    \"qat_finetune.rst\",\n",
"    \"lora_finetune.rst\",\n",
"]\n",
@@ -44,14 +44,13 @@ def main(output_dir: str):
    if return_type_errors:
        print("\nAPI Method Return Type Validation Errors:\n")
        for error in return_type_errors:
            print(error, file=sys.stderr)
            print(error)
        sys.exit(1)
    now = str(datetime.now())
    print(
        "Converting the spec to YAML (openapi.yaml) and HTML (openapi.html) at " + now
    )
    print("")

    spec = Specification(
        LlamaStack,
        Options(
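The hunk above differs only in whether validation errors go to stderr or stdout before the spec is generated. Purely as an illustration (not the repository's actual code), the surrounding pattern is: collect the errors, report them, and exit nonzero so CI fails before any spec files are written.

```python
# Illustrative only: the report-and-exit pattern the hunk above touches.
import sys


def report_and_exit(errors: list[str]) -> None:
    # Diagnostics go to stderr so stdout stays clean for generated artifacts.
    if errors:
        print("\nAPI Method Return Type Validation Errors:\n", file=sys.stderr)
        for error in errors:
            print(error, file=sys.stderr)
        sys.exit(1)


report_and_exit([])  # no errors, so this returns normally
```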
@@ -6,7 +6,6 @@

import hashlib
import ipaddress
import types
import typing
from dataclasses import make_dataclass
from typing import Any, Dict, Set, Union
@@ -180,7 +179,7 @@ class ContentBuilder:
        "Creates the content subtree for a request or response."

        def is_iterator_type(t):
            return "StreamChunk" in str(t) or "OpenAIResponseObjectStream" in str(t)
            return "StreamChunk" in str(t)

        def get_media_type(t):
            if is_generic_list(t):
@@ -190,7 +189,7 @@ class ContentBuilder:
            else:
                return "application/json"

        if typing.get_origin(payload_type) in (typing.Union, types.UnionType):
        if typing.get_origin(payload_type) is typing.Union:
            media_types = []
            item_types = []
            for x in typing.get_args(payload_type):
@@ -520,7 +519,7 @@ class Generator:
        )

    def _build_extra_tag_groups(
        self, extra_types: Dict[str, Dict[str, type]]
        self, extra_types: Dict[str, List[type]]
    ) -> Dict[str, List[Tag]]:
        """
        Creates a dictionary of tag group captions as keys, and tag lists as values.
@@ -533,8 +532,9 @@
        for category_name, category_items in extra_types.items():
            tag_list: List[Tag] = []

            for name, extra_type in category_items.items():
                schema = self.schema_builder.classdef_to_schema(extra_type)
            for extra_type in category_items:
                name = python_type_to_name(extra_type)
                schema = self.schema_builder.classdef_to_named_schema(name, extra_type)
                tag_list.append(self._build_type_tag(name, schema))

            if tag_list:
@@ -759,7 +759,7 @@
        )

        return Operation(
            tags=[getattr(op.defining_class, "API_NAMESPACE", op.defining_class.__name__)],
            tags=[op.defining_class.__name__],
            summary=None,
            # summary=doc_string.short_description,
            description=description,
@@ -805,8 +805,6 @@
        operation_tags: List[Tag] = []
        for cls in endpoint_classes:
            doc_string = parse_type(cls)
            if hasattr(cls, "API_NAMESPACE") and cls.API_NAMESPACE != cls.__name__:
                continue
            operation_tags.append(
                Tag(
                    name=cls.__name__,
@@ -865,7 +863,7 @@
        for caption, extra_tag_group in extra_tag_groups.items():
            tag_groups.append(
                TagGroup(
                    name=caption,
                    name=self.options.map(caption),
                    tags=sorted(tag.name for tag in extra_tag_group),
                )
            )
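Several of the generator hunks above concern operation tagging: one side tags an operation with its defining class name, the other prefers a class-level API_NAMESPACE attribute when present and skips duplicate tag entries for namespaced classes. A small sketch of that lookup follows; the class names here are hypothetical, not the generator's real protocol classes.

```python
# Hypothetical sketch of the API_NAMESPACE fallback shown in the hunks above.
class Inference:
    """A protocol-style class with no namespace override."""


class BatchInference:
    """A protocol-style class grouped under another tag via API_NAMESPACE."""

    API_NAMESPACE = "Inference"


def operation_tag(cls: type) -> str:
    # Prefer API_NAMESPACE when defined, otherwise fall back to the class name.
    return getattr(cls, "API_NAMESPACE", cls.__name__)


print(operation_tag(Inference))       # -> Inference
print(operation_tag(BatchInference))  # -> Inference
```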
@@ -132,18 +132,7 @@ def _validate_api_method_return_type(method) -> str | None:

    return_type = hints['return']
    if is_optional_type(return_type):
        return "returns Optional type where a return value is mandatory"


def _validate_api_method_doesnt_return_list(method) -> str | None:
    hints = get_type_hints(method)

    if 'return' not in hints:
        return "has no return type annotation"

    return_type = hints['return']
    if get_origin(return_type) is list:
        return "returns a list where a PaginatedResponse or List*Response object is expected"
        return "returns Optional type"


def _validate_api_delete_method_returns_none(method) -> str | None:
@@ -154,84 +143,15 @@ def _validate_api_delete_method_returns_none(method) -> str | None:

    return_type = hints['return']
    if return_type is not None and return_type is not type(None):
        return "does not return None where None is mandatory"
        return "does not return None"


def _validate_list_parameters_contain_data(method) -> str | None:
    hints = get_type_hints(method)

    if 'return' not in hints:
        return "has no return type annotation"

    return_type = hints['return']
    if not inspect.isclass(return_type):
        return

    if not return_type.__name__.startswith('List'):
        return

    if 'data' not in return_type.model_fields:
        return "does not have a mandatory data attribute containing the list of objects"


def _validate_has_ellipsis(method) -> str | None:
    source = inspect.getsource(method)
    if "..." not in source and not "NotImplementedError" in source:
        return "does not contain ellipsis (...) in its implementation"

def _validate_has_return_in_docstring(method) -> str | None:
    source = inspect.getsource(method)
    return_type = method.__annotations__.get('return')
    if return_type is not None and return_type != type(None) and ":returns:" not in source:
        return "does not have a ':returns:' in its docstring"

def _validate_has_params_in_docstring(method) -> str | None:
    source = inspect.getsource(method)
    sig = inspect.signature(method)
    # Only check if the method has more than one parameter
    if len(sig.parameters) > 1 and ":param" not in source:
        return "does not have a ':param' in its docstring"

def _validate_has_no_return_none_in_docstring(method) -> str | None:
    source = inspect.getsource(method)
    return_type = method.__annotations__.get('return')
    if return_type is None and ":returns: None" in source:
        return "has a ':returns: None' in its docstring which is redundant for None-returning functions"

def _validate_docstring_lines_end_with_dot(method) -> str | None:
    docstring = inspect.getdoc(method)
    if docstring is None:
        return None

    lines = docstring.split('\n')
    for line in lines:
        line = line.strip()
        if line and not any(line.endswith(char) for char in '.:{}[]()",'):
            return f"docstring line '{line}' does not end with a valid character: . : {{ }} [ ] ( ) , \""

_VALIDATORS = {
    "GET": [
        _validate_api_method_return_type,
        _validate_list_parameters_contain_data,
        _validate_api_method_doesnt_return_list,
        _validate_has_ellipsis,
        _validate_has_return_in_docstring,
        _validate_has_params_in_docstring,
        _validate_docstring_lines_end_with_dot,
    ],
    "DELETE": [
        _validate_api_delete_method_returns_none,
        _validate_has_ellipsis,
        _validate_has_return_in_docstring,
        _validate_has_params_in_docstring,
        _validate_has_no_return_none_in_docstring
    ],
    "POST": [
        _validate_has_ellipsis,
        _validate_has_return_in_docstring,
        _validate_has_params_in_docstring,
        _validate_has_no_return_none_in_docstring,
        _validate_docstring_lines_end_with_dot,
    ],
}
@@ -2,14 +2,6 @@

Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [ReadTheDocs page](https://llama-stack.readthedocs.io/en/latest/index.html).

## Render locally

From the llama-stack root directory, run the following command to render the docs locally:
```bash
uv run --group docs sphinx-autobuild docs/source docs/build/html --write-all
```
You can open up the docs in your browser at http://localhost:8000

## Content

Try out Llama Stack's capabilities through our detailed Jupyter notebooks:
docs/requirements.txt (new file, 14 lines)
@@ -0,0 +1,14 @@
sphinx==8.1.3
myst-parser
linkify
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
sphinx-rtd-theme>=1.0.0
sphinx-pdj-theme
sphinx-copybutton
sphinx-tabs
sphinx-design
sphinxcontrib-openapi
sphinxcontrib-redoc
sphinxcontrib-mermaid
sphinxcontrib-video
tomli
@@ -1,9 +1,6 @@
# Agents
# Llama Stack Agent Framework

An Agent in Llama Stack is a powerful abstraction that allows you to build complex AI applications.

The Llama Stack agent framework is built on a modular architecture that allows for flexible and powerful AI
applications. This document explains the key components and how they work together.
The Llama Stack agent framework is built on a modular architecture that allows for flexible and powerful AI applications. This document explains the key components and how they work together.

## Core Concepts

@@ -1,10 +1,6 @@
## Agent Execution Loop

Agents are the heart of Llama Stack applications. They combine inference, memory, safety, and tool usage into coherent
workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage,
and safety checks.

### Steps in the Agent Workflow
Agents are the heart of complex AI applications. They combine inference, memory, safety, and tool usage into coherent workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage, and safety checks.

Each agent turn follows these key steps:

@@ -68,10 +64,7 @@ sequenceDiagram
    S->>U: 5. Final Response
```

Each step in this process can be monitored and controlled through configurations.

### Agent Execution Loop Example
Here's an example that demonstrates monitoring the agent's execution:
Each step in this process can be monitored and controlled through configurations. Here's an example that demonstrates monitoring the agent's execution:

```python
from llama_stack_client import LlamaStackClient, Agent, AgentEventLogger
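The hunk is cut off right after the import line of the monitoring example. For readers following along, a compact version of the pattern that import supports looks roughly like this; the base URL and model identifier are placeholders for your own deployment, and the agent and session calls follow the llama-stack-client usage shown elsewhere on this page.

```python
# Hedged sketch of the monitoring pattern the trimmed example introduces.
# "http://localhost:8321" and the model id are placeholders, not values from this diff.
from llama_stack_client import LlamaStackClient, Agent, AgentEventLogger

client = LlamaStackClient(base_url="http://localhost:8321")

agent = Agent(
    client,
    model="meta-llama/Llama-3.2-3B-Instruct",
    instructions="You are a helpful assistant.",
)
session_id = agent.create_session("monitoring-demo")

response = agent.create_turn(
    messages=[{"role": "user", "content": "What is Llama Stack?"}],
    session_id=session_id,
    stream=True,
)

# Each logged event maps to a step in the execution loop described above.
for log in AgentEventLogger().log(response):
    log.print()
```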
@@ -1,4 +1,4 @@
# Building AI Applications (Examples)
# Building AI Applications

Llama Stack provides all the building blocks needed to create sophisticated AI applications.

@@ -8,9 +8,9 @@ The best way to get started is to look at this notebook which walks through the

Here are some key topics that will help you build effective agents:

- **[RAG (Retrieval-Augmented Generation)](rag)**: Learn how to enhance your agents with external knowledge through retrieval mechanisms.
- **[Agent](agent)**: Understand the components and design patterns of the Llama Stack agent framework.
- **[Agent Execution Loop](agent_execution_loop)**: Understand how agents process information, make decisions, and execute actions in a continuous loop.
- **[RAG (Retrieval-Augmented Generation)](rag)**: Learn how to enhance your agents with external knowledge through retrieval mechanisms.
- **[Tools](tools)**: Extend your agents' capabilities by integrating with external tools and APIs.
- **[Evals](evals)**: Evaluate your agents' effectiveness and identify areas for improvement.
- **[Telemetry](telemetry)**: Monitor and analyze your agents' performance and behavior.
@@ -20,11 +20,12 @@ Here are some key topics that will help you build effective agents:
:hidden:
:maxdepth: 1

rag
agent
agent_execution_loop
rag
tools
evals
telemetry
evals
advanced_agent_patterns
safety
```
@@ -1,11 +1,11 @@
## Retrieval Augmented Generation (RAG)
## Using Retrieval Augmented Generation (RAG)

RAG enables your applications to reference and recall information from previous interactions or external documents.

Llama Stack organizes the APIs that enable RAG into three layers:
1. The lowermost APIs deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon) and Relational IO (also coming soon.).
2. The next is the "Rag Tool", a first-class tool as part of the [Tools API](tools.md) that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly.
3. Finally, it all comes together with the top-level ["Agents" API](agent.md) that allows you to create agents that can use the tools to answer questions, perform tasks, and more.
- the lowermost APIs deal with raw storage and retrieval. These include Vector IO, KeyValue IO (coming soon) and Relational IO (also coming soon.)
- next is the "Rag Tool", a first-class tool as part of the Tools API that allows you to ingest documents (from URLs, files, etc) with various chunking strategies and query them smartly.
- finally, it all comes together with the top-level "Agents" API that allows you to create agents that can use the tools to answer questions, perform tasks, and more.

<img src="rag.png" alt="RAG System" width="50%">

@@ -17,19 +17,14 @@ We may add more storage types like Graph IO in the future.

### Setting up Vector DBs

For this guide, we will use [Ollama](https://ollama.com/) as the inference provider.
Ollama is an LLM runtime that allows you to run Llama models locally.

Here's how to set up a vector database for RAG:

```python
# Create http client
import os
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")


# Register a vector db
vector_db_id = "my_documents"
response = client.vector_dbs.register(
@@ -38,53 +33,17 @@
    embedding_dimension=384,
    provider_id="faiss",
)
```

### Ingesting Documents
You can ingest documents into the vector database using two methods: directly inserting pre-chunked
documents or using the RAG Tool.
```python
# You can insert a pre-chunked document directly into the vector db
chunks = [
    {
        "document_id": "doc1",
        "content": "Your document text here",
        "mime_type": "text/plain",
        "metadata": {
            "document_id": "doc1",
            "author": "Jane Doe",
        },
    },
]
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks)
```

#### Using Precomputed Embeddings
If you decide to precompute embeddings for your documents, you can insert them directly into the vector database by
including the embedding vectors in the chunk data. This is useful if you have a separate embedding service or if you
want to customize the ingestion process.
```python
chunks_with_embeddings = [
    {
        "content": "First chunk of text",
        "mime_type": "text/plain",
        "embedding": [0.1, 0.2, 0.3, ...],  # Your precomputed embedding vector
        "metadata": {"document_id": "doc1", "section": "introduction"},
    },
    {
        "content": "Second chunk of text",
        "mime_type": "text/plain",
        "embedding": [0.2, 0.3, 0.4, ...],  # Your precomputed embedding vector
        "metadata": {"document_id": "doc1", "section": "methodology"},
    },
]
client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks_with_embeddings)
```
When providing precomputed embeddings, ensure the embedding dimension matches the embedding_dimension specified when
registering the vector database.

### Retrieval
You can query the vector database to retrieve documents based on their embeddings.
```python
# You can then query for these chunks
chunks_response = client.vector_io.query(
    vector_db_id=vector_db_id, query="What do you know about..."
@@ -93,9 +52,7 @@ chunks_response = client.vector_io.query(

### Using the RAG Tool

A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc.
and automatically chunks them into smaller pieces. More examples for how to format a RAGDocument can be found in the
[appendix](#more-ragdocument-examples).
A better way to ingest documents is to use the RAG Tool. This tool allows you to ingest documents from URLs, files, etc. and automatically chunks them into smaller pieces.

```python
from llama_stack_client import RAGDocument
@@ -124,17 +81,6 @@ results = client.tool_runtime.rag_tool.query(
)
```

You can configure how the RAG tool adds metadata to the context if you find it useful for your application. Simply add:
```python
# Query documents
results = client.tool_runtime.rag_tool.query(
    vector_db_ids=[vector_db_id],
    content="What do you know about...",
    query_config={
        "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
    },
)
```
### Building RAG-Enhanced Agents

One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:

@@ -152,12 +98,6 @@
            "name": "builtin::rag/knowledge_search",
            "args": {
                "vector_db_ids": [vector_db_id],
                # Defaults
                "query_config": {
                    "chunk_size_in_tokens": 512,
                    "chunk_overlap_in_tokens": 0,
                    "chunk_template": "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n",
                },
            },
        }
    ],
@@ -222,38 +162,3 @@ for vector_db_id in client.vector_dbs.list():
    print(f"Unregistering vector database: {vector_db_id.identifier}")
    client.vector_dbs.unregister(vector_db_id=vector_db_id.identifier)
```

### Appendix

#### More RAGDocument Examples
```python
from llama_stack_client import RAGDocument
import base64

RAGDocument(document_id="num-0", content={"uri": "file://path/to/file"})
RAGDocument(document_id="num-1", content="plain text")
RAGDocument(
    document_id="num-2",
    content={
        "type": "text",
        "text": "plain text input",
    },  # for inputs that should be treated as text explicitly
)
RAGDocument(
    document_id="num-3",
    content={
        "type": "image",
        "image": {"url": {"uri": "https://mywebsite.com/image.jpg"}},
    },
)
B64_ENCODED_IMAGE = base64.b64encode(
    requests.get(
        "https://raw.githubusercontent.com/meta-llama/llama-stack/refs/heads/main/docs/_static/llama-stack.png"
    ).content
)
RAGDocuemnt(
    document_id="num-4",
    content={"type": "image", "image": {"data": B64_ENCODED_IMAGE}},
)
```
for more strongly typed interaction use the typed dicts found [here](https://github.com/meta-llama/llama-stack-client-python/blob/38cd91c9e396f2be0bec1ee96a19771582ba6f17/src/llama_stack_client/types/shared_params/document.py).
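The RAG-agent hunk above stops at the tool configuration. A hedged continuation sketch, reusing the vector_db_id and the builtin::rag/knowledge_search tool from the snippets above (the base URL, model id, and question text are placeholders), might look like:

```python
# Hedged continuation sketch: ask the RAG-enabled agent about the ingested documents.
# Base URL, model id, and the question text are placeholders, not values from this diff.
from llama_stack_client import LlamaStackClient, Agent, AgentEventLogger

client = LlamaStackClient(base_url="http://localhost:8321")
vector_db_id = "my_documents"

agent = Agent(
    client,
    model="meta-llama/Llama-3.2-3B-Instruct",
    instructions="Use the knowledge_search tool to answer questions about the ingested documents.",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": [vector_db_id]},
        }
    ],
)

session_id = agent.create_session("rag-demo")
response = agent.create_turn(
    messages=[{"role": "user", "content": "What do the ingested documents cover?"}],
    session_id=session_id,
    stream=True,
)
for log in AgentEventLogger().log(response):
    log.print()
```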
@@ -45,16 +45,14 @@ Here's an example that sends telemetry signals to all three sink types. Your con
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      sinks: ['console', 'sqlite', 'otel_trace', 'otel_metric']
      otel_trace_endpoint: "http://localhost:4318/v1/traces"
      otel_metric_endpoint: "http://localhost:4318/v1/metrics"
      sinks: ['console', 'sqlite', 'otel']
      otel_endpoint: "http://localhost:4318/v1/traces"
      sqlite_db_path: "/path/to/telemetry.db"
```

### Jaeger to visualize traces

The `otel` sink works with any service compatible with the OpenTelemetry collector, traces and metrics has two separate endpoints.
Let's use Jaeger to visualize this data.
The `otel` sink works with any service compatible with the OpenTelemetry collector. Let's use Jaeger to visualize this data.

Start a Jaeger instance with the OTLP HTTP endpoint at 4318 and the Jaeger UI at 16686 using the following command:

Some files were not shown because too many files have changed in this diff.