Merge branch 'main' into aidand-groq-tool-call-tweaks

This commit is contained in:
Ashwin Bharambe 2025-01-28 05:00:57 -08:00 committed by GitHub
commit 7421aa44e4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
267 changed files with 14707 additions and 16232 deletions

View file

@ -11,6 +11,10 @@ on:
jobs: jobs:
build-and-push: build-and-push:
runs-on: ubuntu-latest runs-on: ubuntu-latest
env:
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
TAVILY_SEARCH_API_KEY: ${{ secrets.TAVILY_SEARCH_API_KEY }}
permissions: permissions:
contents: read contents: read
packages: write packages: write
@ -32,7 +36,7 @@ jobs:
id: version id: version
run: | run: |
if [ "${{ github.event_name }}" = "push" ]; then if [ "${{ github.event_name }}" = "push" ]; then
echo "VERSION=0.0.63.dev20250114" >> $GITHUB_OUTPUT echo "VERSION=0.0.63.dev51206766" >> $GITHUB_OUTPUT
else else
echo "VERSION=${{ inputs.version }}" >> $GITHUB_OUTPUT echo "VERSION=${{ inputs.version }}" >> $GITHUB_OUTPUT
fi fi
@ -42,8 +46,11 @@ jobs:
# Function to check if version exists in a repository # Function to check if version exists in a repository
check_version() { check_version() {
local repo=$1 local repo=$1
local status_code=$(curl -s -o /dev/null -w "%{http_code}" "https://$repo.org/project/llama-stack/${{ steps.version.outputs.version }}") local VERSION_TO_CHECK=${{ steps.version.outputs.version }}
return $([ "$status_code" -eq 200 ]) echo "Checking version $VERSION_TO_CHECK in $repo"
result=$(curl -s "https://$repo.org/pypi/llama-stack/json" | jq --arg v "$VERSION_TO_CHECK" '.releases | has($v)')
echo "Result: $result"
return $([ "$result" = "true" ])
} }
# Check TestPyPI first, then PyPI # Check TestPyPI first, then PyPI
@ -60,6 +67,7 @@ jobs:
- name: Install llama-stack - name: Install llama-stack
run: | run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
if [ "${{ github.event_name }}" = "push" ]; then if [ "${{ github.event_name }}" = "push" ]; then
pip install -e . pip install -e .
else else
@ -72,12 +80,14 @@ jobs:
- name: Build docker image - name: Build docker image
run: | run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
echo "VERSION=${{ steps.version.outputs.version }}"
TEMPLATES=("ollama" "bedrock" "remote-vllm" "fireworks" "together" "tgi" "meta-reference-gpu") TEMPLATES=("ollama" "bedrock" "remote-vllm" "fireworks" "together" "tgi" "meta-reference-gpu")
for template in "${TEMPLATES[@]}"; do for template in "${TEMPLATES[@]}"; do
if [ "$PYPI_SOURCE" = "testpypi" ]; then if [ "$PYPI_SOURCE" = "testpypi" ]; then
TEST_PYPI_VERSION=${{ steps.version.outputs.version }} llama stack build --template $template --image-type docker TEST_PYPI_VERSION=${{ steps.version.outputs.version }} llama stack build --template $template --image-type container
else else
PYPI_VERSION=${{ steps.version.outputs.version }} llama stack build --template $template --image-type docker PYPI_VERSION=${{ steps.version.outputs.version }} llama stack build --template $template --image-type container
fi fi
done done
@ -85,8 +95,45 @@ jobs:
run: | run: |
docker images docker images
# TODO (xiyan): make the following 2 steps into a matrix and test all templates other than fireworks
- name: Start up built docker image
run: |
cd distributions/fireworks
if [ "$PYPI_SOURCE" = "testpypi" ]; then
sed -i 's|image: llamastack/distribution-fireworks|image: distribution-fireworks:test-${{ steps.version.outputs.version }}|' ./compose.yaml
else
sed -i 's|image: llamastack/distribution-fireworks|image: distribution-fireworks:${{ steps.version.outputs.version }}|' ./compose.yaml
fi
docker compose up -d
cd ..
# Wait for the container to start
timeout=300
while ! curl -s -f http://localhost:8321/v1/version > /dev/null && [ $timeout -gt 0 ]; do
echo "Waiting for endpoint to be available..."
sleep 5
timeout=$((timeout - 5))
done
if [ $timeout -le 0 ]; then
echo "Timeout waiting for endpoint to become available"
exit 1
fi
- name: Run simple models list test on docker server
run: |
curl http://localhost:8321/v1/models
# TODO (xiyan): figure out why client cannot find server but curl works
# - name: Run pytest on docker server
# run: |
# pip install pytest pytest-md-report
# export LLAMA_STACK_BASE_URL="http://localhost:8321"
# LLAMA_STACK_BASE_URL="http://localhost:8321" pytest -v tests/client-sdk/inference/test_inference.py --md-report --md-report-verbose=1
- name: Push to dockerhub - name: Push to dockerhub
run: | run: |
echo "PYPI_SOURCE=${PYPI_SOURCE}"
echo "VERSION=${{ steps.version.outputs.version }}"
TEMPLATES=("ollama" "bedrock" "remote-vllm" "fireworks" "together" "tgi" "meta-reference-gpu") TEMPLATES=("ollama" "bedrock" "remote-vllm" "fireworks" "together" "tgi" "meta-reference-gpu")
for template in "${TEMPLATES[@]}"; do for template in "${TEMPLATES[@]}"; do
if [ "$PYPI_SOURCE" = "testpypi" ]; then if [ "$PYPI_SOURCE" = "testpypi" ]; then
@ -94,6 +141,8 @@ jobs:
docker push llamastack/distribution-$template:test-${{ steps.version.outputs.version }} docker push llamastack/distribution-$template:test-${{ steps.version.outputs.version }}
else else
docker tag distribution-$template:${{ steps.version.outputs.version }} llamastack/distribution-$template:${{ steps.version.outputs.version }} docker tag distribution-$template:${{ steps.version.outputs.version }} llamastack/distribution-$template:${{ steps.version.outputs.version }}
docker tag distribution-$template:${{ steps.version.outputs.version }} llamastack/distribution-$template:latest
docker push llamastack/distribution-$template:${{ steps.version.outputs.version }} docker push llamastack/distribution-$template:${{ steps.version.outputs.version }}
docker push llamastack/distribution-$template:latest
fi fi
done done

View file

@ -238,7 +238,7 @@ jobs:
run: | run: |
pip install pytest nbval pip install pytest nbval
llama stack build --template together --image-type venv llama stack build --template together --image-type venv
pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Building_AI_Applications.ipynb pytest -v -s --nbval-lax ./docs/getting_started.ipynb
pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb pytest -v -s --nbval-lax ./docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb
# TODO: add trigger for integration test workflow & docker builds # TODO: add trigger for integration test workflow & docker builds

View file

@ -12,6 +12,57 @@ We actively welcome your pull requests.
5. Make sure your code lints. 5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA"). 6. If you haven't already, complete the Contributor License Agreement ("CLA").
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Meta's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Pre-commit Hooks
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
```bash
$ cd llama-stack
$ conda activate <your-environment>
$ pip install pre-commit
$ pre-commit install
```
After that, pre-commit hooks will run automatically before each commit.
## Coding Style
* 2 spaces for indentation rather than tabs
* 80 character line length
* ...
## Common Tasks
Some tips about common tasks you work on while contributing to Llama Stack:
### Using `llama stack build`
Building a stack image (conda / docker) will use the production version of the `llama-stack`, `llama-models` and `llama-stack-client` packages. If you are developing with a llama-stack repository checked out and need your code to be reflected in the stack image, set `LLAMA_STACK_DIR` and `LLAMA_MODELS_DIR` to the appropriate checked out directories when running any of the `llama` CLI commands.
Example:
```bash
$ cd work/
$ git clone https://github.com/meta-llama/llama-stack.git
$ git clone https://github.com/meta-llama/llama-models.git
$ cd llama-stack
$ LLAMA_STACK_DIR=$(pwd) LLAMA_MODELS_DIR=../llama-models llama stack build --template <...>
```
### Updating Provider Configurations ### Updating Provider Configurations
@ -31,40 +82,6 @@ make html
sphinx-autobuild source build/html sphinx-autobuild source build/html
``` ```
## Pre-commit Hooks
We use [pre-commit](https://pre-commit.com/) to run linting and formatting checks on your code. You can install the pre-commit hooks by running:
```bash
$ cd llama-stack
$ conda activate <your-environment>
$ pip install pre-commit
$ pre-commit install
```
After that, pre-commit hooks will run automatically before each commit.
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Meta's open source projects.
Complete your CLA here: <https://code.facebook.com/cla>
## Issues
We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.
Meta has a [bounty program](http://facebook.com/whitehat/info) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.
## Coding Style
* 2 spaces for indentation rather than tabs
* 80 character line length
* ...
## Tips
* If you are developing with a llama-stack repository checked out and need your distribution to reflect changes from there, set `LLAMA_STACK_DIR` to that dir when running any of the `llama` CLI commands.
## License ## License
By contributing to Llama, you agree that your contributions will be licensed By contributing to Llama, you agree that your contributions will be licensed

View file

@ -4,9 +4,15 @@
[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-stack)](https://pypi.org/project/llama-stack/)
[![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack) [![Discord](https://img.shields.io/discord/1257833999603335178)](https://discord.gg/llama-stack)
[**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Zero-to-Hero Guide**](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) [**Quick Start**](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) | [**Documentation**](https://llama-stack.readthedocs.io/en/latest/index.html) | [**Colab Notebook**](./docs/getting_started.ipynb)
Llama Stack defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations. Llama Stack defines and standardizes the core building blocks that simplify AI application development. It codifies best practices across the Llama ecosystem. More specifically, it provides:
- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
- **Plugin architecture** to support the rich ecosystem of implementations of the different APIs in different environments like local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment.
- **Multiple developer interfaces** like CLI and SDKs for Python, Node, iOS, and Android.
- **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack.
<div style="text-align: center;"> <div style="text-align: center;">
<img <img
@ -17,69 +23,20 @@ Llama Stack defines and standardizes the set of core building blocks needed to b
/> />
</div> </div>
Our goal is to provide pre-packaged implementations which can be operated in a variety of deployment environments: developers start iterating with Desktops or their mobile devices and can seamlessly transition to on-prem or public cloud deployments. At every point in this transition, the same set of APIs and the same developer experience is available. ### Llama Stack Benefits
- **Flexible Options**: Developers can choose their preferred infrastructure without changing APIs and enjoy flexible deployment choices.
- **Consistent Experience**: With its unified APIs, Llama Stack makes it easier to build, test, and deploy AI applications with consistent application behavior.
- **Robust Ecosystem**: Llama Stack is already integrated with distribution partners (cloud providers, hardware vendors, and AI-focused companies) that offer tailored infrastructure, software, and services for deploying Llama models.
> ⚠️ **Note** By reducing friction and complexity, Llama Stack empowers developers to focus on what they do best: building transformative generative AI applications.
> The Stack APIs are rapidly improving, but still very much work in progress and we invite feedback as well as direct contributions.
## APIs
We have working implementations of the following APIs today:
- Inference
- Safety
- Memory
- Agents
- Eval
- Telemetry
Alongside these APIs, we also have related APIs for operating with associated resources (see [Concepts](https://llama-stack.readthedocs.io/en/latest/concepts/index.html#resources)):
- Models
- Shields
- Memory Banks
- Eval Tasks
- Datasets
- Scoring Functions
We are also working on the following APIs which will be released soon:
- Post Training
- Synthetic Data Generation
- Reward Scoring
Each of the APIs themselves is a collection of REST endpoints.
## Philosophy
### Service-oriented design
Unlike other frameworks, Llama Stack is built with a service-oriented, REST API-first approach. Such a design not only allows for seamless transitions from a local to remote deployments, but also forces the design to be more declarative. We believe this restriction can result in a much simpler, robust developer experience. This will necessarily trade-off against expressivity however if we get the APIs right, it can lead to a very powerful platform.
### Composability
We expect the set of APIs we design to be composable. An Agent abstractly depends on { Inference, Memory, Safety } APIs but does not care about the actual implementation details. Safety itself may require model inference and hence can depend on the Inference API.
### Turnkey one-stop solutions
We expect to provide turnkey solutions for popular deployment scenarios. It should be easy to deploy a Llama Stack server on AWS or on a private data center. Either of these should allow a developer to get started with powerful agentic apps, model evaluations or fine-tuning services in a matter of minutes. They should all result in the same uniform observability and developer experience.
### Focus on Llama models
As a Meta initiated project, we have started by explicitly focusing on Meta's Llama series of models. Supporting the broad set of open models is no easy task and we want to start with models we understand best.
### Supporting the Ecosystem
There is a vibrant ecosystem of Providers which provide efficient inference or scalable vector stores or powerful observability solutions. We want to make sure it is easy for developers to pick and choose the best implementations for their use cases. We also want to make sure it is easy for new Providers to onboard and participate in the ecosystem.
Additionally, we have designed every element of the Stack such that APIs as well as Resources (like Models) can be federated.
## Supported Llama Stack Implementations
### API Providers ### API Providers
Here is a list of the various API providers and available distributions that can help developers get started easily:
| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | | **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
|:------------------------------------------------------------------------------------------:|:----------------------:|:------------------:|:------------------:|:------------------:|:------------------:|:------------------:| |:------------------------------------------------------------------------------------------:|:----------------------:|:------------------:|:------------------:|:------------------:|:------------------:|:------------------:|
| Meta Reference | Single Node | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | Meta Reference | Single Node | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
| SambaNova | Hosted | | :heavy_check_mark: | | | |
| Cerebras | Hosted | | :heavy_check_mark: | | | | | Cerebras | Hosted | | :heavy_check_mark: | | | |
| Fireworks | Hosted | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | | Fireworks | Hosted | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | |
| AWS Bedrock | Hosted | | :heavy_check_mark: | | :heavy_check_mark: | | | AWS Bedrock | Hosted | | :heavy_check_mark: | | :heavy_check_mark: | |
@ -87,26 +44,29 @@ Additionally, we have designed every element of the Stack such that APIs as well
| Groq | Hosted | | :heavy_check_mark: | | | | | Groq | Hosted | | :heavy_check_mark: | | | |
| Ollama | Single Node | | :heavy_check_mark: | | | | | Ollama | Single Node | | :heavy_check_mark: | | | |
| TGI | Hosted and Single Node | | :heavy_check_mark: | | | | | TGI | Hosted and Single Node | | :heavy_check_mark: | | | |
| [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) | Hosted and Single Node | | :heavy_check_mark: | | | | | NVIDIA NIM | Hosted and Single Node | | :heavy_check_mark: | | | |
| Chroma | Single Node | | | :heavy_check_mark: | | | | Chroma | Single Node | | | :heavy_check_mark: | | |
| PG Vector | Single Node | | | :heavy_check_mark: | | | | PG Vector | Single Node | | | :heavy_check_mark: | | |
| PyTorch ExecuTorch | On-device iOS | :heavy_check_mark: | :heavy_check_mark: | | | | | PyTorch ExecuTorch | On-device iOS | :heavy_check_mark: | :heavy_check_mark: | | | |
| [vLLM](https://github.com/vllm-project/vllm) | Hosted and Single Node | | :heavy_check_mark: | | | | | vLLM | Hosted and Single Node | | :heavy_check_mark: | | | |
### Distributions ### Distributions
A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider implementations for each API component. Distributions make it easy to get started with a specific deployment scenario - you can begin with a local development setup (e.g., Ollama) and seamlessly transition to production (e.g., Fireworks) without changing your application code. Here are some of the distributions we support:
| **Distribution** | **Llama Stack Docker** | Start This Distribution | | **Distribution** | **Llama Stack Docker** | Start This Distribution |
|:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:| |:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:|
| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) | | Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) |
| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) | | Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-quantized-gpu.html) |
| SambaNova | [llamastack/distribution-sambanova](https://hub.docker.com/repository/docker/llamastack/distribution-sambanova/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/sambanova.html) |
| Cerebras | [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/cerebras.html) | | Cerebras | [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/cerebras.html) |
| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html) | | Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html) |
| TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/tgi.html) | | TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/tgi.html) |
| Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/together.html) | | Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/together.html) |
| Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/fireworks.html) | | Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/fireworks.html) |
| [vLLM](https://github.com/vllm-project/vllm) | [llamastack/distribution-remote-vllm](https://hub.docker.com/repository/docker/llamastack/distribution-remote-vllm/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) | | vLLM | [llamastack/distribution-remote-vllm](https://hub.docker.com/repository/docker/llamastack/distribution-remote-vllm/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) |
## Installation ### Installation
You have two ways to install this repository: You have two ways to install this repository:
@ -131,7 +91,7 @@ You have two ways to install this repository:
pip install -e . pip install -e .
``` ```
## Documentation ### Documentation
Please check out our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details. Please check out our [Documentation](https://llama-stack.readthedocs.io/en/latest/index.html) page for more details.
@ -139,13 +99,13 @@ Please checkout our [Documentation](https://llama-stack.readthedocs.io/en/latest
* Guide using `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution. * Guide using `llama` CLI to work with Llama models (download, study prompts), and building/starting a Llama Stack distribution.
* [Getting Started](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html) * [Getting Started](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html)
* Quick guide to start a Llama Stack server. * Quick guide to start a Llama Stack server.
* [Jupyter notebook](./docs/notebooks/Llama_Stack_Building_AI_Applications.ipynb) to walk-through how to use simple text and vision inference llama_stack_client APIs * [Jupyter notebook](./docs/getting_started.ipynb) to walk-through how to use simple text and vision inference llama_stack_client APIs
* The complete Llama Stack lesson [Colab notebook](https://colab.research.google.com/drive/1dtVmxotBsI4cGZQNsJRYPrLiDeT0Wnwt) of the new [Llama 3.2 course on Deeplearning.ai](https://learn.deeplearning.ai/courses/introducing-multimodal-llama-3-2/lesson/8/llama-stack). * The complete Llama Stack lesson [Colab notebook](https://colab.research.google.com/drive/1dtVmxotBsI4cGZQNsJRYPrLiDeT0Wnwt) of the new [Llama 3.2 course on Deeplearning.ai](https://learn.deeplearning.ai/courses/introducing-multimodal-llama-3-2/lesson/8/llama-stack).
* A [Zero-to-Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guides you through all the key components of llama stack with code samples. * A [Zero-to-Hero Guide](https://github.com/meta-llama/llama-stack/tree/main/docs/zero_to_hero_guide) that guides you through all the key components of llama stack with code samples.
* [Contributing](CONTRIBUTING.md) * [Contributing](CONTRIBUTING.md)
* [Adding a new API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html) to walk-through how to add a new API provider. * [Adding a new API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html) to walk-through how to add a new API provider.
## Llama Stack Client SDKs ### Llama Stack Client SDKs
| **Language** | **Client SDK** | **Package** | | **Language** | **Client SDK** | **Package** |
| :----: | :----: | :----: | | :----: | :----: | :----: |

View file

@ -1,6 +1,6 @@
version: '2' version: '2'
image_name: local image_name: local
docker_image: null container_image: null
conda_env: local conda_env: local
apis: apis:
- shields - shields

View file

@ -1,4 +1,34 @@
{ {
"sambanova": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"openai",
"opentelemetry-exporter-otlp-proto-http",
"opentelemetry-sdk",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"requests",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"hf-serverless": [ "hf-serverless": [
"aiohttp", "aiohttp",
"aiosqlite", "aiosqlite",
@ -13,6 +43,7 @@
"httpx", "httpx",
"huggingface_hub", "huggingface_hub",
"matplotlib", "matplotlib",
"mcp",
"nltk", "nltk",
"numpy", "numpy",
"openai", "openai",
@ -45,6 +76,7 @@
"fire", "fire",
"httpx", "httpx",
"matplotlib", "matplotlib",
"mcp",
"nltk", "nltk",
"numpy", "numpy",
"openai", "openai",
@ -78,6 +110,7 @@
"fire", "fire",
"httpx", "httpx",
"matplotlib", "matplotlib",
"mcp",
"nltk", "nltk",
"numpy", "numpy",
"openai", "openai",
@ -101,14 +134,17 @@
], ],
"remote-vllm": [ "remote-vllm": [
"aiosqlite", "aiosqlite",
"autoevals",
"blobfile", "blobfile",
"chardet", "chardet",
"chromadb-client", "chromadb-client",
"datasets",
"faiss-cpu", "faiss-cpu",
"fastapi", "fastapi",
"fire", "fire",
"httpx", "httpx",
"matplotlib", "matplotlib",
"mcp",
"nltk", "nltk",
"numpy", "numpy",
"openai", "openai",
@ -142,6 +178,7 @@
"fireworks-ai", "fireworks-ai",
"httpx", "httpx",
"matplotlib", "matplotlib",
"mcp",
"nltk", "nltk",
"numpy", "numpy",
"openai", "openai",
@ -176,6 +213,7 @@
"httpx", "httpx",
"huggingface_hub", "huggingface_hub",
"matplotlib", "matplotlib",
"mcp",
"nltk", "nltk",
"numpy", "numpy",
"openai", "openai",
@ -209,6 +247,7 @@
"fire", "fire",
"httpx", "httpx",
"matplotlib", "matplotlib",
"mcp",
"nltk", "nltk",
"numpy", "numpy",
"openai", "openai",
@ -244,6 +283,7 @@
"httpx", "httpx",
"lm-format-enforcer", "lm-format-enforcer",
"matplotlib", "matplotlib",
"mcp",
"nltk", "nltk",
"numpy", "numpy",
"openai", "openai",
@ -279,6 +319,7 @@
"fire", "fire",
"httpx", "httpx",
"matplotlib", "matplotlib",
"mcp",
"nltk", "nltk",
"numpy", "numpy",
"openai", "openai",
@ -315,6 +356,7 @@
"httpx", "httpx",
"lm-format-enforcer", "lm-format-enforcer",
"matplotlib", "matplotlib",
"mcp",
"nltk", "nltk",
"numpy", "numpy",
"openai", "openai",
@ -421,6 +463,7 @@
"httpx", "httpx",
"huggingface_hub", "huggingface_hub",
"matplotlib", "matplotlib",
"mcp",
"nltk", "nltk",
"numpy", "numpy",
"openai", "openai",

View file

@ -1,13 +1,11 @@
services: services:
llamastack: llamastack:
image: llamastack/distribution-fireworks image: llamastack/distribution-fireworks
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/llamastack-run-fireworks.yaml
ports: ports:
- "8321:8321" - "8321:8321"
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-fireworks.yaml" environment:
- FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
entrypoint: bash -c "python -m llama_stack.distribution.server.server --template fireworks"
deploy: deploy:
restart_policy: restart_policy:
condition: on-failure condition: on-failure

View file

@ -1,6 +1,6 @@
version: '2' version: '2'
image_name: local image_name: local
docker_image: null container_image: null
conda_env: local conda_env: local
apis: apis:
- shields - shields

View file

@ -0,0 +1,9 @@
name: runpod
distribution_spec:
description: Use Runpod for running LLM inference
providers:
inference: remote::runpod
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1 @@
../../llama_stack/templates/sambanova/build.yaml

View file

@ -0,0 +1,16 @@
services:
llamastack:
image: llamastack/distribution-sambanova
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/llamastack-run-sambanova.yaml
ports:
- "5000:5000"
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-sambanova.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1 @@
../../llama_stack/templates/sambanova/run.yaml

View file

@ -1,13 +1,11 @@
services: services:
llamastack: llamastack:
image: llamastack/distribution-together image: llamastack/distribution-together
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/llamastack-run-together.yaml
ports: ports:
- "8321:8321" - "8321:8321"
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-together.yaml" environment:
- TOGETHER_API_KEY=${TOGETHER_API_KEY}
entrypoint: bash -c "python -m llama_stack.distribution.server.server --template together"
deploy: deploy:
restart_policy: restart_policy:
condition: on-failure condition: on-failure

View file

@ -1,6 +1,6 @@
version: '2' version: '2'
image_name: local image_name: local
docker_image: null container_image: null
conda_env: local conda_env: local
apis: apis:
- shields - shields

File diff suppressed because one or more lines are too long

View file

@ -369,7 +369,7 @@
"- telemetry\n", "- telemetry\n",
"- tool_runtime\n", "- tool_runtime\n",
"datasets: <span style=\"font-weight: bold\">[]</span>\n", "datasets: <span style=\"font-weight: bold\">[]</span>\n",
"docker_image: null\n", "container_image: null\n",
"eval_tasks: <span style=\"font-weight: bold\">[]</span>\n", "eval_tasks: <span style=\"font-weight: bold\">[]</span>\n",
"image_name: together\n", "image_name: together\n",
"memory_banks: <span style=\"font-weight: bold\">[]</span>\n", "memory_banks: <span style=\"font-weight: bold\">[]</span>\n",
@ -513,8 +513,8 @@
" provider_id: code-interpreter\n", " provider_id: code-interpreter\n",
" provider_type: inlin<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">e::c</span>ode-interpreter\n", " provider_type: inlin<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-weight: bold\">e::c</span>ode-interpreter\n",
" - config: <span style=\"font-weight: bold\">{}</span>\n", " - config: <span style=\"font-weight: bold\">{}</span>\n",
" provider_id: memory-runtime\n", " provider_id: rag-runtime\n",
" provider_type: inline::memory-runtime\n", " provider_type: inline::rag-runtime\n",
"scoring_fns: <span style=\"font-weight: bold\">[]</span>\n", "scoring_fns: <span style=\"font-weight: bold\">[]</span>\n",
"shields:\n", "shields:\n",
"- params: null\n", "- params: null\n",
@ -528,8 +528,8 @@
" toolgroup_id: builtin::websearch\n", " toolgroup_id: builtin::websearch\n",
"- args: null\n", "- args: null\n",
" mcp_endpoint: null\n", " mcp_endpoint: null\n",
" provider_id: memory-runtime\n", " provider_id: rag-runtime\n",
" toolgroup_id: builtin::memory\n", " toolgroup_id: builtin::rag\n",
"- args: null\n", "- args: null\n",
" mcp_endpoint: null\n", " mcp_endpoint: null\n",
" provider_id: code-interpreter\n", " provider_id: code-interpreter\n",
@ -550,7 +550,7 @@
"- telemetry\n", "- telemetry\n",
"- tool_runtime\n", "- tool_runtime\n",
"datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "datasets: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"docker_image: null\n", "container_image: null\n",
"eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "eval_tasks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"image_name: together\n", "image_name: together\n",
"memory_banks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "memory_banks: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
@ -694,8 +694,8 @@
" provider_id: code-interpreter\n", " provider_id: code-interpreter\n",
" provider_type: inlin\u001b[1;92me::c\u001b[0mode-interpreter\n", " provider_type: inlin\u001b[1;92me::c\u001b[0mode-interpreter\n",
" - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n", " - config: \u001b[1m{\u001b[0m\u001b[1m}\u001b[0m\n",
" provider_id: memory-runtime\n", " provider_id: rag-runtime\n",
" provider_type: inline::memory-runtime\n", " provider_type: inline::rag-runtime\n",
"scoring_fns: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", "scoring_fns: \u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n",
"shields:\n", "shields:\n",
"- params: null\n", "- params: null\n",
@ -709,8 +709,8 @@
" toolgroup_id: builtin::websearch\n", " toolgroup_id: builtin::websearch\n",
"- args: null\n", "- args: null\n",
" mcp_endpoint: null\n", " mcp_endpoint: null\n",
" provider_id: memory-runtime\n", " provider_id: rag-runtime\n",
" toolgroup_id: builtin::memory\n", " toolgroup_id: builtin::rag\n",
"- args: null\n", "- args: null\n",
" mcp_endpoint: null\n", " mcp_endpoint: null\n",
" provider_id: code-interpreter\n", " provider_id: code-interpreter\n",

File diff suppressed because one or more lines are too long

View file

@ -172,10 +172,16 @@ def _get_endpoint_functions(
def _get_defining_class(member_fn: str, derived_cls: type) -> type: def _get_defining_class(member_fn: str, derived_cls: type) -> type:
"Find the class in which a member function is first defined in a class inheritance hierarchy." "Find the class in which a member function is first defined in a class inheritance hierarchy."
# This import must be dynamic here
from llama_stack.apis.tools import RAGToolRuntime, ToolRuntime
# iterate in reverse member resolution order to find most specific class first # iterate in reverse member resolution order to find most specific class first
for cls in reversed(inspect.getmro(derived_cls)): for cls in reversed(inspect.getmro(derived_cls)):
for name, _ in inspect.getmembers(cls, inspect.isfunction): for name, _ in inspect.getmembers(cls, inspect.isfunction):
if name == member_fn: if name == member_fn:
# HACK ALERT
if cls == RAGToolRuntime:
return ToolRuntime
return cls return cls
raise ValidationError( raise ValidationError(

View file

@ -122,9 +122,16 @@ class JsonSchemaAnyOf(JsonSchemaNode):
anyOf: List["JsonSchemaAny"] anyOf: List["JsonSchemaAny"]
@dataclass
class Discriminator:
propertyName: str
mapping: Dict[str, str]
@dataclass @dataclass
class JsonSchemaOneOf(JsonSchemaNode): class JsonSchemaOneOf(JsonSchemaNode):
oneOf: List["JsonSchemaAny"] oneOf: List["JsonSchemaAny"]
discriminator: Optional[Discriminator]
JsonSchemaAny = Union[ JsonSchemaAny = Union[

View file

@ -36,6 +36,7 @@ from typing import (
) )
import jsonschema import jsonschema
from typing_extensions import Annotated
from . import docstring from . import docstring
from .auxiliary import ( from .auxiliary import (
@ -329,7 +330,6 @@ class JsonSchemaGenerator:
if metadata is not None: if metadata is not None:
# type is Annotated[T, ...] # type is Annotated[T, ...]
typ = typing.get_args(data_type)[0] typ = typing.get_args(data_type)[0]
schema = self._simple_type_to_schema(typ) schema = self._simple_type_to_schema(typ)
if schema is not None: if schema is not None:
# recognize well-known auxiliary types # recognize well-known auxiliary types
@ -446,12 +446,31 @@ class JsonSchemaGenerator:
], ],
} }
elif origin_type is Union: elif origin_type is Union:
return { discriminator = None
if typing.get_origin(data_type) is Annotated:
discriminator = typing.get_args(data_type)[1].discriminator
ret = {
"oneOf": [ "oneOf": [
self.type_to_schema(union_type) self.type_to_schema(union_type)
for union_type in typing.get_args(typ) for union_type in typing.get_args(typ)
] ]
} }
if discriminator:
# for each union type, we need to read the value of the discriminator
mapping = {}
for union_type in typing.get_args(typ):
props = self.type_to_schema(union_type, force_expand=True)[
"properties"
]
mapping[props[discriminator]["default"]] = self.type_to_schema(
union_type
)["$ref"]
ret["discriminator"] = {
"propertyName": discriminator,
"mapping": mapping,
}
return ret
elif origin_type is Literal: elif origin_type is Literal:
(literal_value,) = typing.get_args(typ) # unpack value of literal type (literal_value,) = typing.get_args(typ) # unpack value of literal type
schema = self.type_to_schema(type(literal_value)) schema = self.type_to_schema(type(literal_value))

11
docs/readme.md Normal file
View file

@ -0,0 +1,11 @@
# Llama Stack Documentation
Here's a collection of comprehensive guides, examples, and resources for building AI applications with Llama Stack. For the complete documentation, visit our [ReadTheDocs page](https://llama-stack.readthedocs.io/en/latest/index.html).
## Content
Try out Llama Stack's capabilities through our detailed Jupyter notebooks:
* [Building AI Applications Notebook](./notebooks/Llama_Stack_Building_AI_Applications.ipynb) - A comprehensive guide to building production-ready AI applications using Llama Stack
* [Benchmark Evaluations Notebook](./notebooks/Llama_Stack_Benchmark_Evals.ipynb) - Detailed performance evaluations and benchmarking results
* [Zero-to-Hero Guide](./notebooks/Llama_Stack_Zero_to_Hero_Guide.ipynb) - Step-by-step guide for getting started with Llama Stack

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,123 @@
## Agent Execution Loop
Agents are the heart of complex AI applications. They combine inference, memory, safety, and tool usage into coherent workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage, and safety checks.
Each agent turn follows these key steps:
1. **Initial Safety Check**: The user's input is first screened through configured safety shields
2. **Context Retrieval**:
- If RAG is enabled, the agent queries relevant documents from memory banks
- For new documents, they are first inserted into the memory bank
- Retrieved context is appended to the user's prompt
3. **Inference Loop**: The agent enters its main execution loop:
- The LLM receives the augmented prompt (with context and/or previous tool outputs)
- The LLM generates a response, potentially with tool calls
- If tool calls are present:
- Tool inputs are safety-checked
- Tools are executed (e.g., web search, code execution)
- Tool responses are fed back to the LLM for synthesis
- The loop continues until:
- The LLM provides a final response without tool calls
- Maximum iterations are reached
- Token limit is exceeded
4. **Final Safety Check**: The agent's final response is screened through safety shields
```{mermaid}
sequenceDiagram
participant U as User
participant E as Executor
participant M as Memory Bank
participant L as LLM
participant T as Tools
participant S as Safety Shield
Note over U,S: Agent Turn Start
U->>S: 1. Submit Prompt
activate S
S->>E: Input Safety Check
deactivate S
E->>M: 2.1 Query Context
M-->>E: 2.2 Retrieved Documents
loop Inference Loop
E->>L: 3.1 Augment with Context
L-->>E: 3.2 Response (with/without tool calls)
alt Has Tool Calls
E->>S: Check Tool Input
S->>T: 4.1 Execute Tool
T-->>E: 4.2 Tool Response
E->>L: 5.1 Tool Response
L-->>E: 5.2 Synthesized Response
end
opt Stop Conditions
Note over E: Break if:
Note over E: - No tool calls
Note over E: - Max iterations reached
Note over E: - Token limit exceeded
end
end
E->>S: Output Safety Check
S->>U: 6. Final Response
```
Each step in this process can be monitored and controlled through configurations. Here's an example that demonstrates monitoring the agent's execution:
```python
from llama_stack_client.lib.agents.event_logger import EventLogger
agent_config = AgentConfig(
model="Llama3.2-3B-Instruct",
instructions="You are a helpful assistant",
# Enable both RAG and tool usage
toolgroups=[
{"name": "builtin::rag", "args": {"vector_db_ids": ["my_docs"]}},
"builtin::code_interpreter",
],
# Configure safety
input_shields=["llama_guard"],
output_shields=["llama_guard"],
# Control the inference loop
max_infer_iters=5,
sampling_params={
"strategy": {
"type": "top_p",
"temperature": 0.7,
"top_p": 0.95
},
"max_tokens": 2048
}
)
agent = Agent(client, agent_config)
session_id = agent.create_session("monitored_session")
# Stream the agent's execution steps
response = agent.create_turn(
messages=[{"role": "user", "content": "Analyze this code and run it"}],
attachments=[{
"content": "https://raw.githubusercontent.com/example/code.py",
"mime_type": "text/plain"
}],
session_id=session_id
)
# Monitor each step of execution
for log in EventLogger().log(response):
if log.event.step_type == "memory_retrieval":
print("Retrieved context:", log.event.retrieved_context)
elif log.event.step_type == "inference":
print("LLM output:", log.event.model_response)
elif log.event.step_type == "tool_execution":
print("Tool call:", log.event.tool_call)
print("Tool response:", log.event.tool_response)
elif log.event.step_type == "shield_call":
if log.event.violation:
print("Safety violation:", log.event.violation)
```

View file

@ -1,8 +1,8 @@
# Benchmark Evaluations # Evals
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing)
Llama Stack provides the building blocks needed to run benchmark and application evaluations. This guide will walk you through how to use these components to run open benchmark evaluations. Visit our [Evaluation Concepts](../concepts/evaluation_concepts.md) guide for more details on how evaluations work in Llama Stack, and our [Evaluation Reference](../references/evals_reference/index.md) guide for a comprehensive reference on the APIs. Check out our [Colab notebook](https://colab.research.google.com/drive/10CHyykee9j2OigaIcRv47BKG9mrNm0tJ?usp=sharing) on working examples on how you can use Llama Stack for running benchmark evaluations. Llama Stack provides the building blocks needed to run benchmark and application evaluations. This guide will walk you through how to use these components to run open benchmark evaluations. Visit our [Evaluation Concepts](../concepts/evaluation_concepts.md) guide for more details on how evaluations work in Llama Stack, and our [Evaluation Reference](../references/evals_reference/index.md) guide for a comprehensive reference on the APIs.
### 1. Open Benchmark Model Evaluation ### 1. Open Benchmark Model Evaluation

View file

@ -0,0 +1,36 @@
## Testing & Evaluation
Llama Stack provides built-in tools for evaluating your applications:
1. **Benchmarking**: Test against standard datasets
2. **Application Evaluation**: Score your application's outputs
3. **Custom Metrics**: Define your own evaluation criteria
Here's how to set up basic evaluation:
```python
# Create an evaluation task
response = client.eval_tasks.register(
eval_task_id="my_eval",
dataset_id="my_dataset",
scoring_functions=["accuracy", "relevance"]
)
# Run evaluation
job = client.eval.run_eval(
task_id="my_eval",
task_config={
"type": "app",
"eval_candidate": {
"type": "agent",
"config": agent_config
}
}
)
# Get results
result = client.eval.job_result(
task_id="my_eval",
job_id=job.job_id
)
```

View file

@ -1,425 +1,29 @@
# Building AI Applications # Building AI Applications
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1F2ksmkoGQPa4pzRjMOE6BXWeOxWFIW6n?usp=sharing) Llama Stack provides all the building blocks needed to create sophisticated AI applications.
Llama Stack provides all the building blocks needed to create sophisticated AI applications. This guide will walk you through how to use these components effectively. Check out our Colab notebook to follow along with working examples of how you can build LLM-powered agentic applications using Llama Stack. The best way to get started is to look at this notebook which walks through the various APIs (from basic inference, to RAG agents) and how to use them.
## Basic Inference **Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Building_AI_Applications.ipynb)
The foundation of any AI application is the ability to interact with LLM models. Llama Stack provides a simple interface for both completion and chat-based inference: Here are some key topics that will help you build effective agents:
```python - **[Agent Execution Loop](agent_execution_loop)**
from llama_stack_client import LlamaStackClient - **[RAG](rag)**
- **[Safety](safety)**
- **[Tools](tools)**
- **[Telemetry](telemetry)**
- **[Evals](evals)**
client = LlamaStackClient(base_url="http://localhost:5001")
# List available models
models = client.models.list()
# Simple chat completion
response = client.inference.chat_completion(
model_id="Llama3.2-3B-Instruct",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Write a haiku about coding"}
]
)
print(response.completion_message.content)
```
## Adding Memory & RAG
Memory enables your applications to reference and recall information from previous interactions or external documents. Llama Stack's memory system is built around the concept of Memory Banks:
1. **Vector Memory Banks**: For semantic search and retrieval
2. **Key-Value Memory Banks**: For structured data storage
3. **Keyword Memory Banks**: For basic text search
4. **Graph Memory Banks**: For relationship-based retrieval
Here's how to set up a vector memory bank for RAG:
```python
# Register a memory bank
bank_id = "my_documents"
response = client.memory_banks.register(
memory_bank_id=bank_id,
params={
"memory_bank_type": "vector",
"embedding_model": "all-MiniLM-L6-v2",
"chunk_size_in_tokens": 512
}
)
# Insert documents
documents = [
{
"document_id": "doc1",
"content": "Your document text here",
"mime_type": "text/plain"
}
]
client.memory.insert(bank_id, documents)
# Query documents
results = client.memory.query(
bank_id=bank_id,
query="What do you know about...",
)
```
## Implementing Safety Guardrails
Safety is a critical component of any AI application. Llama Stack provides a Shield system that can be applied at multiple touchpoints:
```python
# Register a safety shield
shield_id = "content_safety"
client.shields.register(
shield_id=shield_id,
provider_shield_id="llama-guard-basic"
)
# Run content through shield
response = client.safety.run_shield(
shield_id=shield_id,
messages=[{"role": "user", "content": "User message here"}]
)
if response.violation:
print(f"Safety violation detected: {response.violation.user_message}")
```
## Building Agents
Agents are the heart of complex AI applications. They combine inference, memory, safety, and tool usage into coherent workflows. At its core, an agent follows a sophisticated execution loop that enables multi-step reasoning, tool usage, and safety checks.
### The Agent Execution Loop
Each agent turn follows these key steps:
1. **Initial Safety Check**: The user's input is first screened through configured safety shields
2. **Context Retrieval**:
- If RAG is enabled, the agent queries relevant documents from memory banks
- For new documents, they are first inserted into the memory bank
- Retrieved context is augmented to the user's prompt
3. **Inference Loop**: The agent enters its main execution loop:
- The LLM receives the augmented prompt (with context and/or previous tool outputs)
- The LLM generates a response, potentially with tool calls
- If tool calls are present:
- Tool inputs are safety-checked
- Tools are executed (e.g., web search, code execution)
- Tool responses are fed back to the LLM for synthesis
- The loop continues until:
- The LLM provides a final response without tool calls
- Maximum iterations are reached
- Token limit is exceeded
4. **Final Safety Check**: The agent's final response is screened through safety shields
```{mermaid}
sequenceDiagram
participant U as User
participant E as Executor
participant M as Memory Bank
participant L as LLM
participant T as Tools
participant S as Safety Shield
Note over U,S: Agent Turn Start
U->>S: 1. Submit Prompt
activate S
S->>E: Input Safety Check
deactivate S
E->>M: 2.1 Query Context
M-->>E: 2.2 Retrieved Documents
loop Inference Loop
E->>L: 3.1 Augment with Context
L-->>E: 3.2 Response (with/without tool calls)
alt Has Tool Calls
E->>S: Check Tool Input
S->>T: 4.1 Execute Tool
T-->>E: 4.2 Tool Response
E->>L: 5.1 Tool Response
L-->>E: 5.2 Synthesized Response
end
opt Stop Conditions
Note over E: Break if:
Note over E: - No tool calls
Note over E: - Max iterations reached
Note over E: - Token limit exceeded
end
end
E->>S: Output Safety Check
S->>U: 6. Final Response
```
Each step in this process can be monitored and controlled through configurations. Here's an example that demonstrates monitoring the agent's execution:
```python
from llama_stack_client.lib.agents.event_logger import EventLogger
agent_config = AgentConfig(
model="Llama3.2-3B-Instruct",
instructions="You are a helpful assistant",
# Enable both RAG and tool usage
tools=[
{
"type": "memory",
"memory_bank_configs": [{
"type": "vector",
"bank_id": "my_docs"
}],
"max_tokens_in_context": 4096
},
{
"type": "code_interpreter",
"enable_inline_code_execution": True
}
],
# Configure safety
input_shields=["content_safety"],
output_shields=["content_safety"],
# Control the inference loop
max_infer_iters=5,
sampling_params={
"strategy": {
"type": "top_p",
"temperature": 0.7,
"top_p": 0.95
},
"max_tokens": 2048
}
)
agent = Agent(client, agent_config)
session_id = agent.create_session("monitored_session")
# Stream the agent's execution steps
response = agent.create_turn(
messages=[{"role": "user", "content": "Analyze this code and run it"}],
attachments=[{
"content": "https://raw.githubusercontent.com/example/code.py",
"mime_type": "text/plain"
}],
session_id=session_id
)
# Monitor each step of execution
for log in EventLogger().log(response):
if log.event.step_type == "memory_retrieval":
print("Retrieved context:", log.event.retrieved_context)
elif log.event.step_type == "inference":
print("LLM output:", log.event.model_response)
elif log.event.step_type == "tool_execution":
print("Tool call:", log.event.tool_call)
print("Tool response:", log.event.tool_response)
elif log.event.step_type == "shield_call":
if log.event.violation:
print("Safety violation:", log.event.violation)
```
This example shows how an agent can: Llama Stack provides a high-level agent framework:
```python
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.types.agent_create_params import AgentConfig
# Configure an agent
agent_config = AgentConfig(
model="Llama3.2-3B-Instruct",
instructions="You are a helpful assistant",
tools=[
{
"type": "memory",
"memory_bank_configs": [],
"query_generator_config": {
"type": "default",
"sep": " "
}
}
],
input_shields=["content_safety"],
output_shields=["content_safety"],
enable_session_persistence=True
)
# Create an agent
agent = Agent(client, agent_config)
session_id = agent.create_session("my_session")
# Run agent turns
response = agent.create_turn(
messages=[{"role": "user", "content": "Your question here"}],
session_id=session_id
)
```
### Adding Tools to Agents
Agents can be enhanced with various tools:
1. **Search**: Web search capabilities through providers like Brave
2. **Code Interpreter**: Execute code snippets
3. **RAG**: Memory and document retrieval
4. **Function Calling**: Custom function execution
5. **WolframAlpha**: Mathematical computations
6. **Photogen**: Image generation
Example of configuring an agent with tools:
```python
agent_config = AgentConfig(
model="Llama3.2-3B-Instruct",
tools=[
{
"type": "brave_search",
"api_key": "YOUR_API_KEY",
"engine": "brave"
},
{
"type": "code_interpreter",
"enable_inline_code_execution": True
}
],
tool_choice="auto",
tool_prompt_format="json"
)
```
## Building RAG-Enhanced Agents
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
```python
from llama_stack_client.types import Attachment
# Create attachments from documents
attachments = [
Attachment(
content="https://raw.githubusercontent.com/example/doc.rst",
mime_type="text/plain"
)
]
# Configure agent with memory
agent_config = AgentConfig(
model="Llama3.2-3B-Instruct",
instructions="You are a helpful assistant",
tools=[{
"type": "memory",
"memory_bank_configs": [],
"query_generator_config": {"type": "default", "sep": " "},
"max_tokens_in_context": 4096,
"max_chunks": 10
}],
enable_session_persistence=True
)
agent = Agent(client, agent_config)
session_id = agent.create_session("rag_session")
# Initial document ingestion
response = agent.create_turn(
messages=[{
"role": "user",
"content": "I am providing some documents for reference."
}],
attachments=attachments,
session_id=session_id
)
# Query with RAG
response = agent.create_turn(
messages=[{
"role": "user",
"content": "What are the key topics in the documents?"
}],
session_id=session_id
)
```
## Testing & Evaluation
Llama Stack provides built-in tools for evaluating your applications:
1. **Benchmarking**: Test against standard datasets
2. **Application Evaluation**: Score your application's outputs
3. **Custom Metrics**: Define your own evaluation criteria
Here's how to set up basic evaluation:
```python
# Create an evaluation task
response = client.eval_tasks.register(
eval_task_id="my_eval",
dataset_id="my_dataset",
scoring_functions=["accuracy", "relevance"]
)
# Run evaluation
job = client.eval.run_eval(
task_id="my_eval",
task_config={
"type": "app",
"eval_candidate": {
"type": "agent",
"config": agent_config
}
}
)
# Get results
result = client.eval.job_result(
task_id="my_eval",
job_id=job.job_id
)
```
## Debugging & Monitoring
Llama Stack includes comprehensive telemetry for debugging and monitoring your applications:
1. **Tracing**: Track request flows across components
2. **Metrics**: Measure performance and usage
3. **Logging**: Debug issues and track behavior
The telemetry system supports multiple output formats:
- OpenTelemetry for visualization in tools like Jaeger
- SQLite for local storage and querying
- Console output for development
Example of querying traces:
```python
# Query traces for a session
traces = client.telemetry.query_traces(
attribute_filters=[{
"key": "session_id",
"op": "eq",
"value": session_id
}]
)
# Get spans within the root span; indexed by ID
# Use parent_span_id to build a tree out of it
spans_by_id = client.telemetry.get_span_tree(
span_id=traces[0].root_span_id
)
```
For details on how to use the telemetry system to debug your applications, export traces to a dataset, and run evaluations, see the [Telemetry](telemetry) section.
```{toctree} ```{toctree}
:hidden: :hidden:
:maxdepth: 3 :maxdepth: 1
agent_execution_loop
rag
safety
tools
telemetry telemetry
evals
``` ```

View file

@ -0,0 +1,92 @@
## Memory & RAG
Memory enables your applications to reference and recall information from previous interactions or external documents. Llama Stack's memory system is built around the concept of Memory Banks:
1. **Vector Memory Banks**: For semantic search and retrieval
2. **Key-Value Memory Banks**: For structured data storage
3. **Keyword Memory Banks**: For basic text search
4. **Graph Memory Banks**: For relationship-based retrieval
Here's how to set up a vector memory bank for RAG:
```python
# Register a memory bank
bank_id = "my_documents"
response = client.memory_banks.register(
memory_bank_id=bank_id,
params={
"memory_bank_type": "vector",
"embedding_model": "all-MiniLM-L6-v2",
"chunk_size_in_tokens": 512
}
)
# Insert documents
documents = [
{
"document_id": "doc1",
"content": "Your document text here",
"mime_type": "text/plain"
}
]
client.memory.insert(bank_id, documents)
# Query documents
results = client.memory.query(
bank_id=bank_id,
query="What do you know about...",
)
```
### Building RAG-Enhanced Agents
One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example:
```python
from llama_stack_client.types import Attachment
# Create attachments from documents
attachments = [
Attachment(
content="https://raw.githubusercontent.com/example/doc.rst",
mime_type="text/plain"
)
]
# Configure agent with memory
agent_config = AgentConfig(
model="Llama3.2-3B-Instruct",
instructions="You are a helpful assistant",
tools=[{
"type": "memory",
"memory_bank_configs": [],
"query_generator_config": {"type": "default", "sep": " "},
"max_tokens_in_context": 4096,
"max_chunks": 10
}],
enable_session_persistence=True
)
agent = Agent(client, agent_config)
session_id = agent.create_session("rag_session")
# Initial document ingestion
response = agent.create_turn(
messages=[{
"role": "user",
"content": "I am providing some documents for reference."
}],
attachments=attachments,
session_id=session_id
)
# Query with RAG
response = agent.create_turn(
messages=[{
"role": "user",
"content": "What are the key topics in the documents?"
}],
session_id=session_id
)
```

View file

@ -0,0 +1,21 @@
## Safety Guardrails
Safety is a critical component of any AI application. Llama Stack provides a Shield system that can be applied at multiple touchpoints:
```python
# Register a safety shield
shield_id = "content_safety"
client.shields.register(
shield_id=shield_id,
provider_shield_id="llama-guard-basic"
)
# Run content through shield
response = client.safety.run_shield(
shield_id=shield_id,
messages=[{"role": "user", "content": "User message here"}]
)
if response.violation:
print(f"Safety violation detected: {response.violation.user_message}")
```

View file

@ -1,14 +1,7 @@
# Telemetry ## Telemetry
```{note}
The telemetry system is currently experimental and subject to change. We welcome feedback and contributions to help improve it.
```
The Llama Stack telemetry system provides comprehensive tracing, metrics, and logging capabilities. It supports multiple sink types including OpenTelemetry, SQLite, and Console output. The Llama Stack telemetry system provides comprehensive tracing, metrics, and logging capabilities. It supports multiple sink types including OpenTelemetry, SQLite, and Console output.
## Key Concepts
### Events ### Events
The telemetry system supports three main types of events: The telemetry system supports three main types of events:
@ -44,67 +37,15 @@ structured_log_event = SpanStartPayload(
- **SQLite**: Store events in a local SQLite database. This is needed if you want to query the events later through the Llama Stack API. - **SQLite**: Store events in a local SQLite database. This is needed if you want to query the events later through the Llama Stack API.
- **Console**: Print events to the console. - **Console**: Print events to the console.
## APIs ### Providers
The telemetry API is designed to be flexible for different user flows like debugging/visualization in UI, monitoring, and saving traces to datasets. #### Meta-Reference Provider
The telemetry system exposes the following HTTP endpoints:
### Log Event
```http
POST /telemetry/log-event
```
Logs a telemetry event (unstructured log, metric, or structured log) with optional TTL.
### Query Traces
```http
POST /telemetry/query-traces
```
Retrieves traces based on filters with pagination support. Parameters:
- `attribute_filters`: List of conditions to filter traces
- `limit`: Maximum number of traces to return (default: 100)
- `offset`: Number of traces to skip (default: 0)
- `order_by`: List of fields to sort by
### Get Span Tree
```http
POST /telemetry/get-span-tree
```
Retrieves a hierarchical view of spans starting from a specific span. Parameters:
- `span_id`: ID of the root span to retrieve
- `attributes_to_return`: Optional list of specific attributes to include
- `max_depth`: Optional maximum depth of the span tree to return
### Query Spans
```http
POST /telemetry/query-spans
```
Retrieves spans matching specified filters and returns selected attributes. Parameters:
- `attribute_filters`: List of conditions to filter traces
- `attributes_to_return`: List of specific attributes to include in results
- `max_depth`: Optional maximum depth of spans to traverse (default: no limit)
Returns a flattened list of spans with requested attributes.
### Save Spans to Dataset
This is useful for saving traces to a dataset for running evaluations. For example, you can save the input/output of each span that is part of an agent session/turn to a dataset and then run an eval task on it. See example in [Example: Save Spans to Dataset](#example-save-spans-to-dataset).
```http
POST /telemetry/save-spans-to-dataset
```
Queries spans and saves their attributes to a dataset. Parameters:
- `attribute_filters`: List of conditions to filter traces
- `attributes_to_save`: List of span attributes to save to the dataset
- `dataset_id`: ID of the dataset to save to
- `max_depth`: Optional maximum depth of spans to traverse (default: no limit)
## Providers
### Meta-Reference Provider
Currently, only the meta-reference provider is implemented. It can be configured to send events to three sink types: Currently, only the meta-reference provider is implemented. It can be configured to send events to three sink types:
1) OpenTelemetry Collector 1) OpenTelemetry Collector
2) SQLite 2) SQLite
3) Console 3) Console
## Configuration #### Configuration
Here's an example that sends telemetry signals to all three sink types. Your configuration might use only one. Here's an example that sends telemetry signals to all three sink types. Your configuration might use only one.
```yaml ```yaml
@ -117,7 +58,7 @@ Here's an example that sends telemetry signals to all three sink types. Your con
sqlite_db_path: "/path/to/telemetry.db" sqlite_db_path: "/path/to/telemetry.db"
``` ```
## Jaeger to visualize traces ### Jaeger to visualize traces
The `otel` sink works with any service compatible with the OpenTelemetry collector. Let's use Jaeger to visualize this data. The `otel` sink works with any service compatible with the OpenTelemetry collector. Let's use Jaeger to visualize this data.
@ -131,112 +72,6 @@ $ docker run --rm --name jaeger \
Once the Jaeger instance is running, you can visualize traces by navigating to http://localhost:16686/. Once the Jaeger instance is running, you can visualize traces by navigating to http://localhost:16686/.
## Querying Traces Stored in SQLIte ### Querying Traces Stored in SQLite
The `sqlite` sink allows you to query traces without an external system. Here are some example queries: The `sqlite` sink allows you to query traces without an external system. Here are some example queries. Refer to the notebook at [Llama Stack Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) for more examples on how to query traces and spans.
Querying Traces for an agent session
The client SDK is not updated to support the new telemetry API. It will be updated soon. You can manually query traces using the following curl command:
``` bash
curl -X POST 'http://localhost:8321/alpha/telemetry/query-traces' \
-H 'Content-Type: application/json' \
-d '{
"attribute_filters": [
{
"key": "session_id",
"op": "eq",
"value": "dd667b87-ca4b-4d30-9265-5a0de318fc65" }],
"limit": 100,
"offset": 0,
"order_by": ["start_time"]
[
{
"trace_id": "6902f54b83b4b48be18a6f422b13e16f",
"root_span_id": "5f37b85543afc15a",
"start_time": "2024-12-04T08:08:30.501587",
"end_time": "2024-12-04T08:08:36.026463"
},
........
]
}'
```
Querying spans for a specific root span id
``` bash
curl -X POST 'http://localhost:8321/alpha/telemetry/get-span-tree' \
-H 'Content-Type: application/json' \
-d '{ "span_id" : "6cceb4b48a156913", "max_depth": 2 }'
{
"span_id": "6cceb4b48a156913",
"trace_id": "dafa796f6aaf925f511c04cd7c67fdda",
"parent_span_id": "892a66d726c7f990",
"name": "retrieve_rag_context",
"start_time": "2024-12-04T09:28:21.781995",
"end_time": "2024-12-04T09:28:21.913352",
"attributes": {
"input": [
"{\"role\":\"system\",\"content\":\"You are a helpful assistant\"}",
"{\"role\":\"user\",\"content\":\"What are the top 5 topics that were explained in the documentation? Only list succinct bullet points.\",\"context\":null}"
]
},
"children": [
{
"span_id": "1a2df181854064a8",
"trace_id": "dafa796f6aaf925f511c04cd7c67fdda",
"parent_span_id": "6cceb4b48a156913",
"name": "MemoryRouter.query_documents",
"start_time": "2024-12-04T09:28:21.787620",
"end_time": "2024-12-04T09:28:21.906512",
"attributes": {
"input": null
},
"children": [],
"status": "ok"
}
],
"status": "ok"
}
```
## Example: Save Spans to Dataset
Save all spans for a specific agent session to a dataset.
``` bash
curl -X POST 'http://localhost:8321/alpha/telemetry/save-spans-to-dataset' \
-H 'Content-Type: application/json' \
-d '{
"attribute_filters": [
{
"key": "session_id",
"op": "eq",
"value": "dd667b87-ca4b-4d30-9265-5a0de318fc65"
}
],
"attributes_to_save": ["input", "output"],
"dataset_id": "my_dataset",
"max_depth": 10
}'
```
Save all spans for a specific agent turn to a dataset.
```bash
curl -X POST 'http://localhost:8321/alpha/telemetry/save-spans-to-dataset' \
-H 'Content-Type: application/json' \
-d '{
"attribute_filters": [
{
"key": "turn_id",
"op": "eq",
"value": "123e4567-e89b-12d3-a456-426614174000"
}
],
"attributes_to_save": ["input", "output"],
"dataset_id": "my_dataset",
"max_depth": 10
}'
```

View file

@ -0,0 +1,202 @@
# Tools
Tools are functions that can be invoked by an agent to perform tasks. They are organized into tool groups and registered with specific providers. Each tool group represents a collection of related tools from a single provider. Tools are grouped so that state can be externalized: the tools in a group typically operate on the same state.
An example of this would be a "db_access" tool group that contains tools for interacting with a database. "list_tables", "query_table", "insert_row" could be examples of tools in this group.
Tools are treated as any other resource in llama stack like models. You can register them, have providers for them etc.
When instantiating an agent, you can provide it a list of tool groups that it has access to. The agent gets the corresponding tool definitions for the specified tool groups and passes them along to the model.
Refer to the [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) notebook for more examples on how to use tools.
## Types of Tool Group providers
There are three types of providers for tool groups that are supported by Llama Stack.
1. Built-in providers
2. Model Context Protocol (MCP) providers
3. Client provided tools
### Built-in providers
Built-in providers come packaged with Llama Stack. These providers provide common functionalities like web search, code interpretation, and computational capabilities.
#### Web Search providers
There are three web search providers that are supported by Llama Stack.
1. Brave Search
2. Bing Search
3. Tavily Search
Example client SDK call to register a "websearch" toolgroup that is provided by brave-search.
```python
# Register Brave Search tool group
client.toolgroups.register(
toolgroup_id="builtin::websearch",
provider_id="brave-search",
args={"max_results": 5}
)
```
The tool requires an API key which can be provided either in the configuration or through the request header `X-LlamaStack-Provider-Data`. The format of the header is `{"<provider_name>_api_key": <your api key>}`.
#### Code Interpreter
The Code Interpreter allows execution of Python code within a controlled environment.
```python
# Register Code Interpreter tool group
client.toolgroups.register(
toolgroup_id="builtin::code_interpreter",
provider_id="code_interpreter"
)
```
Features:
- Secure execution environment using `bwrap` sandboxing
- Matplotlib support for generating plots
- Disabled dangerous system operations
- Configurable execution timeouts
#### WolframAlpha
The WolframAlpha tool provides access to computational knowledge through the WolframAlpha API.
```python
# Register WolframAlpha tool group
client.toolgroups.register(
toolgroup_id="builtin::wolfram_alpha",
provider_id="wolfram-alpha"
)
```
Example usage:
```python
result = client.tool_runtime.invoke_tool(
tool_name="wolfram_alpha",
args={"query": "solve x^2 + 2x + 1 = 0"}
)
```
#### Memory
The Memory tool enables retrieval of context from various types of memory banks (vector, key-value, keyword, and graph).
```python
# Register Memory tool group
client.toolgroups.register(
toolgroup_id="builtin::memory",
provider_id="memory",
args={
"max_chunks": 5,
"max_tokens_in_context": 4096
}
)
```
Features:
- Support for multiple memory bank types
- Configurable query generation
- Context retrieval with token limits
> **Note:** By default, llama stack run.yaml defines toolgroups for web search, code interpreter and memory, that are provided by tavily-search, code-interpreter and memory providers.
## Model Context Protocol (MCP) Tools
MCP tools are special tools that can interact with llama stack over model context protocol. These tools are dynamically discovered from an MCP endpoint and can be used to extend the agent's capabilities.
Refer to https://github.com/modelcontextprotocol/server for available MCP servers.
```python
# Register MCP tools
client.toolgroups.register(
toolgroup_id="builtin::filesystem",
provider_id="model-context-protocol",
mcp_endpoint=URL(uri="http://localhost:8000/sse"),
)
```
MCP tools require:
- A valid MCP endpoint URL
- The endpoint must implement the Model Context Protocol
- Tools are discovered dynamically from the endpoint
## Tools provided by the client
These tools are registered along with the agent config and are specific to the agent for which they are registered. The main difference between these tools and the tools provided by the built-in providers is that the execution of these tools is handled by the client and the agent transfers the tool call to the client and waits for the result from the client.
```python
# Example agent config with client provided tools
config = AgentConfig(
toolgroups=[
"builtin::websearch",
],
client_tools=[
ToolDef(name="client_tool", description="Client provided tool")
]
)
```
Refer to [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/blob/main/examples/agents/e2e_loop_with_client_tools.py) for an example of how to use client provided tools.
## Tool Structure
Each tool has the following components:
- `name`: Unique identifier for the tool
- `description`: Human-readable description of the tool's functionality
- `parameters`: List of parameters the tool accepts
- `name`: Parameter name
- `parameter_type`: Data type (string, number, etc.)
- `description`: Parameter description
- `required`: Whether the parameter is required (default: true)
- `default`: Default value if any
Example tool definition:
```python
{
"name": "web_search",
"description": "Search the web for information",
"parameters": [
{
"name": "query",
"parameter_type": "string",
"description": "The query to search for",
"required": True
}
]
}
```
## Tool Invocation
Tools can be invoked using the `invoke_tool` method:
```python
result = client.tool_runtime.invoke_tool(
tool_name="web_search",
kwargs={"query": "What is the capital of France?"}
)
```
The result contains:
- `content`: The tool's output
- `error_message`: Optional error message if the tool failed
- `error_code`: Optional error code if the tool failed
## Listing Available Tools
You can list all available tools or filter by tool group:
```python
# List all tools
all_tools = client.tools.list_tools()
# List tools in a specific group
group_tools = client.tools.list_tools(toolgroup_id="search_tools")
```

View file

@ -10,7 +10,6 @@ A Llama Stack API is described as a collection of REST endpoints. We currently s
- **Inference**: run inference with a LLM - **Inference**: run inference with a LLM
- **Safety**: apply safety policies to the output at a Systems (not only model) level - **Safety**: apply safety policies to the output at a Systems (not only model) level
- **Agents**: run multi-step agentic workflows with LLMs with tool usage, memory (RAG), etc. - **Agents**: run multi-step agentic workflows with LLMs with tool usage, memory (RAG), etc.
- **Memory**: store and retrieve data for RAG, chat history, etc.
- **DatasetIO**: interface with datasets and data loaders - **DatasetIO**: interface with datasets and data loaders
- **Scoring**: evaluate outputs of the system - **Scoring**: evaluate outputs of the system
- **Eval**: generate outputs (via Inference or Agents) and perform scoring - **Eval**: generate outputs (via Inference or Agents) and perform scoring
@ -24,22 +23,23 @@ We are working on adding a few more APIs to complete the application lifecycle.
## API Providers ## API Providers
The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Obvious examples for these include The goal of Llama Stack is to build an ecosystem where users can easily swap out different implementations for the same API. Examples for these include:
- LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, etc.), - LLM inference providers (e.g., Fireworks, Together, AWS Bedrock, Groq, Cerebras, SambaNova, etc.),
- Vector databases (e.g., ChromaDB, Weaviate, Qdrant, etc.), - Vector databases (e.g., ChromaDB, Weaviate, Qdrant, FAISS, PGVector, etc.),
- Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.) - Safety providers (e.g., Meta's Llama Guard, AWS Bedrock Guardrails, etc.)
Providers come in two flavors: Providers come in two flavors:
- **Remote**: the provider runs as a separate service external to the Llama Stack codebase. Llama Stack contains a small amount of adapter code. - **Remote**: the provider runs as a separate service external to the Llama Stack codebase. Llama Stack contains a small amount of adapter code.
- **Inline**: the provider is fully specified and implemented within the Llama Stack codebase. It may be a simple wrapper around an existing library, or a full fledged implementation within Llama Stack. - **Inline**: the provider is fully specified and implemented within the Llama Stack codebase. It may be a simple wrapper around an existing library, or a full fledged implementation within Llama Stack.
Most importantly, Llama Stack always strives to provide at least one fully "local" provider for each API so you can iterate on a fully featured environment locally.
## Resources ## Resources
Some of these APIs are associated with a set of **Resources**. Here is the mapping of APIs to resources: Some of these APIs are associated with a set of **Resources**. Here is the mapping of APIs to resources:
- **Inference**, **Eval** and **Post Training** are associated with `Model` resources. - **Inference**, **Eval** and **Post Training** are associated with `Model` resources.
- **Safety** is associated with `Shield` resources. - **Safety** is associated with `Shield` resources.
- **Memory** is associated with `Memory Bank` resources. - **Tool Runtime** is associated with `ToolGroup` resources.
- **DatasetIO** is associated with `Dataset` resources. - **DatasetIO** is associated with `Dataset` resources.
- **Scoring** is associated with `ScoringFunction` resources. - **Scoring** is associated with `ScoringFunction` resources.
- **Eval** is associated with `Model` and `EvalTask` resources. - **Eval** is associated with `Model` and `EvalTask` resources.
@ -58,17 +58,14 @@ While there is a lot of flexibility to mix-and-match providers, often users will
**Remotely Hosted Distro**: These are the simplest to consume from a user perspective. You can simply obtain the API key for these providers, point to a URL and have _all_ Llama Stack APIs working out of the box. Currently, [Fireworks](https://fireworks.ai/) and [Together](https://together.xyz/) provide such easy-to-consume Llama Stack distributions. **Remotely Hosted Distro**: These are the simplest to consume from a user perspective. You can simply obtain the API key for these providers, point to a URL and have _all_ Llama Stack APIs working out of the box. Currently, [Fireworks](https://fireworks.ai/) and [Together](https://together.xyz/) provide such easy-to-consume Llama Stack distributions.
**Locally Hosted Distro**: You may want to run Llama Stack on your own hardware. Typically though, you still need to use Inference via an external service. You can use providers like HuggingFace TGI, Cerebras, Fireworks, Together, etc. for this purpose. Or you may have access to GPUs and can run a [vLLM](https://github.com/vllm-project/vllm) or [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) instance. If you "just" have a regular desktop machine, you can use [Ollama](https://ollama.com/) for inference. To provide convenient quick access to these options, we provide a number of such pre-configured locally-hosted Distros. **Locally Hosted Distro**: You may want to run Llama Stack on your own hardware. Typically though, you still need to use Inference via an external service. You can use providers like HuggingFace TGI, Fireworks, Together, etc. for this purpose. Or you may have access to GPUs and can run a [vLLM](https://github.com/vllm-project/vllm) or [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) instance. If you "just" have a regular desktop machine, you can use [Ollama](https://ollama.com/) for inference. To provide convenient quick access to these options, we provide a number of such pre-configured locally-hosted Distros.
**On-device Distro**: Finally, you may want to run Llama Stack directly on an edge device (mobile phone or a tablet.) We provide Distros for iOS and Android (coming soon.) **On-device Distro**: Finally, you may want to run Llama Stack directly on an edge device (mobile phone or a tablet.) We provide Distros for iOS and Android (coming soon.)
## More Concepts
- [Evaluation Concepts](evaluation_concepts.md)
```{toctree} ```{toctree}
:maxdepth: 1 :maxdepth: 1
:hidden: :hidden:
evaluation_concepts distributions/index
``` ```

View file

@ -1,9 +1,14 @@
# Contributing to Llama Stack # Contributing to Llama Stack
Start with the [Contributing Guide](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md) for some general tips. This section covers a few key topics in more detail.
- [Adding a New API Provider](new_api_provider.md) describes adding new API providers to the Stack.
- [Testing Llama Stack](testing.md) provides details about the testing framework and how to test providers and distributions.
```{toctree} ```{toctree}
:maxdepth: 1 :maxdepth: 1
:hidden:
new_api_provider new_api_provider
memory_api testing
``` ```

View file

@ -1,53 +0,0 @@
# Memory API Providers
This guide gives you references to switch between different memory API providers.
##### pgvector
1. Start running the pgvector server:
```
$ docker run --network host --name mypostgres -it -p 5432:5432 -e POSTGRES_PASSWORD=mysecretpassword -e POSTGRES_USER=postgres -e POSTGRES_DB=postgres pgvector/pgvector:pg16
```
2. Edit the `run.yaml` file to point to the pgvector server.
```
memory:
- provider_id: pgvector
provider_type: remote::pgvector
config:
host: 127.0.0.1
port: 5432
db: postgres
user: postgres
password: mysecretpassword
```
> [!NOTE]
> If you get a `RuntimeError: Vector extension is not installed.`. You will need to run `CREATE EXTENSION IF NOT EXISTS vector;` to include the vector extension. E.g.
```
docker exec -it mypostgres ./bin/psql -U postgres
postgres=# CREATE EXTENSION IF NOT EXISTS vector;
postgres=# SELECT extname from pg_extension;
extname
```
3. Run `docker compose up` with the updated `run.yaml` file.
##### chromadb
1. Start running chromadb server
```
docker run -it --network host --name chromadb -p 6000:6000 -v ./chroma_vdb:/chroma/chroma -e IS_PERSISTENT=TRUE chromadb/chroma:latest
```
2. Edit the `run.yaml` file to point to the chromadb server.
```
memory:
- provider_id: remote::chromadb
provider_type: remote::chromadb
config:
host: localhost
port: 6000
```
3. Run `docker compose up` with the updated `run.yaml` file.

View file

@ -1,26 +1,41 @@
# Adding a New API Provider # Adding a New API Provider
This guide contains references to walk you through adding a new API provider. This guide will walk you through the process of adding a new API provider to Llama Stack.
1. First, decide which API your provider falls into (e.g. Inference, Safety, Agents, Memory).
2. Decide whether your provider is a remote provider, or inline implementation. A remote provider is a provider that makes a remote request to a service. An inline provider is a provider where implementation is executed locally. Checkout the examples, and follow the structure to add your own API provider. Please find the following code pointers:
- {repopath}`Remote Providers::llama_stack/providers/remote` - Begin by reviewing the [core concepts](../concepts/) of Llama Stack and choose the API your provider belongs to (Inference, Safety, VectorIO, etc.)
- {repopath}`Inline Providers::llama_stack/providers/inline` - Determine the provider type ({repopath}`Remote::llama_stack/providers/remote` or {repopath}`Inline::llama_stack/providers/inline`). Remote providers make requests to external services, while inline providers execute implementation locally.
- Add your provider to the appropriate {repopath}`Registry::llama_stack/providers/registry/`. Specify pip dependencies necessary.
- Update any distribution {repopath}`Templates::llama_stack/templates/` build.yaml and run.yaml files if they should include your provider by default. Run {repopath}`llama_stack/scripts/distro_codegen.py` if necessary.
3. [Build a Llama Stack distribution](https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html) with your API provider.
4. Test your code!
## Testing your newly added API providers Here are some example PRs to help you get started:
- [Groq Inference Implementation](https://github.com/meta-llama/llama-stack/pull/609)
- [Nvidia Inference Implementation](https://github.com/meta-llama/llama-stack/pull/355)
- [Model context protocol Tool Runtime](https://github.com/meta-llama/llama-stack/pull/665)
1. Start with an _integration test_ for your provider. That means we will instantiate the real provider, pass it real configuration and if it is a remote service, we will actually hit the remote service. We **strongly** discourage mocking for these tests at the provider level. Llama Stack is first and foremost about integration so we need to make sure stuff works end-to-end. See {repopath}`llama_stack/providers/tests/inference/test_text_inference.py` for an example.
2. In addition, if you want to unit test functionality within your provider, feel free to do so. You can find some tests in `tests/` but they aren't well-supported so far. ## Testing the Provider
3. Test with a client-server Llama Stack setup. (a) Start a Llama Stack server with your own distribution which includes the new provider. (b) Send a client request to the server. See `llama_stack/apis/<api>/client.py` for how this is done. These client scripts can serve as lightweight tests. ### 1. Integration Testing
- Create integration tests that use real provider instances and configurations
- For remote services, test actual API interactions
- Avoid mocking at the provider level since adapter layers tend to be thin
- Reference examples in {repopath}`tests/client-sdk`
You can find more complex client scripts [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main) repo. Note down which scripts works and do not work with your distribution. ### 2. Unit Testing (Optional)
- Add unit tests for provider-specific functionality
- See examples in {repopath}`llama_stack/providers/tests/inference/test_text_inference.py`
## Submit your PR ### 3. End-to-End Testing
1. Start a Llama Stack server with your new provider
2. Test using client requests
3. Verify compatibility with existing client scripts in the [llama-stack-apps](https://github.com/meta-llama/llama-stack-apps/tree/main) repository
4. Document which scripts are compatible with your provider
After you have fully tested your newly added API provider, submit a PR with the attached test plan. You must have a Test Plan in the summary section of your PR. ## Submitting Your PR
1. Ensure all tests pass
2. Include a comprehensive test plan in your PR summary
3. Document any known limitations or considerations
4. Submit your pull request for review

View file

@ -0,0 +1,6 @@
# Testing Llama Stack
Tests are of three different kinds:
- Unit tests
- Provider focused integration tests
- Client SDK tests

View file

@ -4,7 +4,7 @@
This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers. This guide will walk you through the steps to get started with building a Llama Stack distribution from scratch with your choice of API providers.
## Llama Stack Build ### Llama Stack Build
In order to build your own distribution, we recommend you clone the `llama-stack` repository. In order to build your own distribution, we recommend you clone the `llama-stack` repository.
@ -13,29 +13,99 @@ In order to build your own distribution, we recommend you clone the `llama-stack
git clone git@github.com:meta-llama/llama-stack.git git clone git@github.com:meta-llama/llama-stack.git
cd llama-stack cd llama-stack
pip install -e . pip install -e .
llama stack build -h
``` ```
Use the CLI to build your distribution.
The main points to consider are:
1. **Image Type** - Do you want a Conda / venv environment or a Container (e.g., Docker)?
2. **Template** - Do you want to use a template to build your distribution, or start from scratch?
3. **Config** - Do you want to use a pre-existing config file to build your distribution?
We will start build our distribution (in the form of a Conda environment, or Docker image). In this step, we will specify: ```
- `name`: the name for our distribution (e.g. `my-stack`) llama stack build -h
- `image_type`: our build image type (`conda | docker`)
- `distribution_spec`: our distribution specs for specifying API providers usage: llama stack build [-h] [--config CONFIG] [--template TEMPLATE] [--list-templates | --no-list-templates] [--image-type {conda,container,venv}] [--image-name IMAGE_NAME]
- `description`: a short description of the configurations for the distribution
- `providers`: specifies the underlying implementation for serving each API endpoint Build a Llama stack container
- `image_type`: `conda` | `docker` to specify whether to build the distribution in the form of Docker image or Conda environment.
options:
-h, --help show this help message and exit
--config CONFIG Path to a config file to use for the build. You can find example configs in llama_stack/distribution/**/build.yaml.
If this argument is not provided, you will be prompted to enter information interactively
--template TEMPLATE Name of the example template config to use for build. You may use `llama stack build --list-templates` to check out the available templates
--list-templates, --no-list-templates
Show the available templates for building a Llama Stack distribution (default: False)
--image-type {conda,container,venv}
Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config.
--image-name IMAGE_NAME
[for image-type=conda] Name of the conda environment to use for the build. If
not specified, currently active Conda environment will be used. If no Conda
environment is active, you must specify a name.
```
After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command. After this step is complete, a file named `<name>-build.yaml` and template file `<name>-run.yaml` will be generated and saved at the output file path specified at the end of the command.
::::{tab-set} ::::{tab-set}
:::{tab-item} Building from a template
To build from alternative API providers, we provide distribution templates for users to get started building a distribution backed by different providers.
The following command will allow you to see the available templates and their corresponding providers.
```
llama stack build --list-templates
```
```
+------------------------------+-----------------------------------------------------------------------------+
| Template Name | Description |
+------------------------------+-----------------------------------------------------------------------------+
| hf-serverless | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| together | Use Together.AI for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| vllm-gpu | Use a built-in vLLM engine for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| experimental-post-training | Experimental template for post training |
+------------------------------+-----------------------------------------------------------------------------+
| remote-vllm | Use (an external) vLLM server for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| fireworks | Use Fireworks.AI for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| tgi | Use (an external) TGI server for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| bedrock | Use AWS Bedrock for running LLM inference and safety |
+------------------------------+-----------------------------------------------------------------------------+
| meta-reference-gpu | Use Meta Reference for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| nvidia | Use NVIDIA NIM for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| meta-reference-quantized-gpu | Use Meta Reference with fp8, int4 quantization for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| cerebras | Use Cerebras for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| ollama | Use (an external) Ollama server for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
| hf-endpoint | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
+------------------------------+-----------------------------------------------------------------------------+
```
You may then pick a template to build your distribution with providers fitted to your liking.
For example, to build a distribution with TGI as the inference provider, you can run:
```
$ llama stack build --template tgi
...
You can now edit ~/.llama/distributions/llamastack-tgi/tgi-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-tgi/tgi-run.yaml`
```
:::
:::{tab-item} Building from Scratch :::{tab-item} Building from Scratch
- For a new user, we could start off with running `llama stack build` which will allow you to enter an interactive wizard where you will be prompted to enter build configurations. If the provided templates do not fit your use case, you could start off with running `llama stack build` which will allow you to enter an interactive wizard where you will be prompted to enter build configurations.
It would be best to start with a template and understand the structure of the config file and the various concepts (APIs, providers, resources, etc.) before starting from scratch.
``` ```
llama stack build llama stack build
> Enter a name for your Llama Stack (e.g. my-local-stack): my-stack > Enter a name for your Llama Stack (e.g. my-local-stack): my-stack
> Enter the image type you want your Llama Stack to be built as (docker or conda): conda > Enter the image type you want your Llama Stack to be built as (container or conda): conda
Llama Stack is composed of several APIs working together. Let's select Llama Stack is composed of several APIs working together. Let's select
the provider types (implementations) you want to use for these APIs. the provider types (implementations) you want to use for these APIs.
@ -57,272 +127,6 @@ You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack
``` ```
::: :::
:::{tab-item} Building from a template
- To build from alternative API providers, we provide distribution templates for users to get started building a distribution backed by different providers.
The following command will allow you to see the available templates and their corresponding providers.
```
llama stack build --list-templates
```
```
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
| Template Name | Providers | Description |
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
| tgi | { | Use (an external) TGI server for running LLM inference |
| | "inference": [ | |
| | "remote::tgi" | |
| | ], | |
| | "memory": [ | |
| | "inline::faiss", | |
| | "remote::chromadb", | |
| | "remote::pgvector" | |
| | ], | |
| | "safety": [ | |
| | "inline::llama-guard" | |
| | ], | |
| | "agents": [ | |
| | "inline::meta-reference" | |
| | ], | |
| | "telemetry": [ | |
| | "inline::meta-reference" | |
| | ] | |
| | } | |
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
| remote-vllm | { | Use (an external) vLLM server for running LLM inference |
| | "inference": [ | |
| | "remote::vllm" | |
| | ], | |
| | "memory": [ | |
| | "inline::faiss", | |
| | "remote::chromadb", | |
| | "remote::pgvector" | |
| | ], | |
| | "safety": [ | |
| | "inline::llama-guard" | |
| | ], | |
| | "agents": [ | |
| | "inline::meta-reference" | |
| | ], | |
| | "telemetry": [ | |
| | "inline::meta-reference" | |
| | ] | |
| | } | |
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
| vllm-gpu | { | Use a built-in vLLM engine for running LLM inference |
| | "inference": [ | |
| | "inline::vllm" | |
| | ], | |
| | "memory": [ | |
| | "inline::faiss", | |
| | "remote::chromadb", | |
| | "remote::pgvector" | |
| | ], | |
| | "safety": [ | |
| | "inline::llama-guard" | |
| | ], | |
| | "agents": [ | |
| | "inline::meta-reference" | |
| | ], | |
| | "telemetry": [ | |
| | "inline::meta-reference" | |
| | ] | |
| | } | |
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
| meta-reference-quantized-gpu | { | Use Meta Reference with fp8, int4 quantization for running LLM inference |
| | "inference": [ | |
| | "inline::meta-reference-quantized" | |
| | ], | |
| | "memory": [ | |
| | "inline::faiss", | |
| | "remote::chromadb", | |
| | "remote::pgvector" | |
| | ], | |
| | "safety": [ | |
| | "inline::llama-guard" | |
| | ], | |
| | "agents": [ | |
| | "inline::meta-reference" | |
| | ], | |
| | "telemetry": [ | |
| | "inline::meta-reference" | |
| | ] | |
| | } | |
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
| meta-reference-gpu | { | Use Meta Reference for running LLM inference |
| | "inference": [ | |
| | "inline::meta-reference" | |
| | ], | |
| | "memory": [ | |
| | "inline::faiss", | |
| | "remote::chromadb", | |
| | "remote::pgvector" | |
| | ], | |
| | "safety": [ | |
| | "inline::llama-guard" | |
| | ], | |
| | "agents": [ | |
| | "inline::meta-reference" | |
| | ], | |
| | "telemetry": [ | |
| | "inline::meta-reference" | |
| | ] | |
| | } | |
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
| hf-serverless | { | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
| | "inference": [ | |
| | "remote::hf::serverless" | |
| | ], | |
| | "memory": [ | |
| | "inline::faiss", | |
| | "remote::chromadb", | |
| | "remote::pgvector" | |
| | ], | |
| | "safety": [ | |
| | "inline::llama-guard" | |
| | ], | |
| | "agents": [ | |
| | "inline::meta-reference" | |
| | ], | |
| | "telemetry": [ | |
| | "inline::meta-reference" | |
| | ] | |
| | } | |
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
| together | { | Use Together.AI for running LLM inference |
| | "inference": [ | |
| | "remote::together" | |
| | ], | |
| | "memory": [ | |
| | "inline::faiss", | |
| | "remote::chromadb", | |
| | "remote::pgvector" | |
| | ], | |
| | "safety": [ | |
| | "inline::llama-guard" | |
| | ], | |
| | "agents": [ | |
| | "inline::meta-reference" | |
| | ], | |
| | "telemetry": [ | |
| | "inline::meta-reference" | |
| | ] | |
| | } | |
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
| ollama | { | Use (an external) Ollama server for running LLM inference |
| | "inference": [ | |
| | "remote::ollama" | |
| | ], | |
| | "memory": [ | |
| | "inline::faiss", | |
| | "remote::chromadb", | |
| | "remote::pgvector" | |
| | ], | |
| | "safety": [ | |
| | "inline::llama-guard" | |
| | ], | |
| | "agents": [ | |
| | "inline::meta-reference" | |
| | ], | |
| | "telemetry": [ | |
| | "inline::meta-reference" | |
| | ] | |
| | } | |
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
| bedrock | { | Use AWS Bedrock for running LLM inference and safety |
| | "inference": [ | |
| | "remote::bedrock" | |
| | ], | |
| | "memory": [ | |
| | "inline::faiss", | |
| | "remote::chromadb", | |
| | "remote::pgvector" | |
| | ], | |
| | "safety": [ | |
| | "remote::bedrock" | |
| | ], | |
| | "agents": [ | |
| | "inline::meta-reference" | |
| | ], | |
| | "telemetry": [ | |
| | "inline::meta-reference" | |
| | ] | |
| | } | |
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
| hf-endpoint | { | Use (an external) Hugging Face Inference Endpoint for running LLM inference |
| | "inference": [ | |
| | "remote::hf::endpoint" | |
| | ], | |
| | "memory": [ | |
| | "inline::faiss", | |
| | "remote::chromadb", | |
| | "remote::pgvector" | |
| | ], | |
| | "safety": [ | |
| | "inline::llama-guard" | |
| | ], | |
| | "agents": [ | |
| | "inline::meta-reference" | |
| | ], | |
| | "telemetry": [ | |
| | "inline::meta-reference" | |
| | ] | |
| | } | |
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
| fireworks | { | Use Fireworks.AI for running LLM inference |
| | "inference": [ | |
| | "remote::fireworks" | |
| | ], | |
| | "memory": [ | |
| | "inline::faiss", | |
| | "remote::chromadb", | |
| | "remote::pgvector" | |
| | ], | |
| | "safety": [ | |
| | "inline::llama-guard" | |
| | ], | |
| | "agents": [ | |
| | "inline::meta-reference" | |
| | ], | |
| | "telemetry": [ | |
| | "inline::meta-reference" | |
| | ] | |
| | } | |
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
| cerebras | { | Use Cerebras for running LLM inference |
| | "inference": [ | |
| | "remote::cerebras" | |
| | ], | |
| | "safety": [ | |
| | "inline::llama-guard" | |
| | ], | |
| | "memory": [ | |
| | "inline::meta-reference" | |
| | ], | |
| | "agents": [ | |
| | "inline::meta-reference" | |
| | ], | |
| | "telemetry": [ | |
| | "inline::meta-reference" | |
| | ] | |
| | } | |
+------------------------------+----------------------------------------+-----------------------------------------------------------------------------+
```
You may then pick a template to build your distribution with providers fitted to your liking.
For example, to build a distribution with TGI as the inference provider, you can run:
```
llama stack build --template tgi
```
```
$ llama stack build --template tgi
...
You can now edit ~/.llama/distributions/llamastack-tgi/tgi-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-tgi/tgi-run.yaml`
```
:::
:::{tab-item} Building from a pre-existing build config file :::{tab-item} Building from a pre-existing build config file
- In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command. - In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command.
@ -348,35 +152,39 @@ llama stack build --config llama_stack/templates/ollama/build.yaml
``` ```
::: :::
:::{tab-item} Building Docker :::{tab-item} Building Container
> [!TIP] > [!TIP]
> Podman is supported as an alternative to Docker. Set `DOCKER_BINARY` to `podman` in your environment to use Podman. > Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podman` in your environment to use Podman.
To build a docker image, you may start off from a template and use the `--image-type docker` flag to specify `docker` as the build image type. To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type.
``` ```
llama stack build --template ollama --image-type docker llama stack build --template ollama --image-type container
``` ```
``` ```
$ llama stack build --template ollama --image-type docker $ llama stack build --template ollama --image-type container
... ...
Dockerfile created successfully in /tmp/tmp.viA3a3Rdsg/DockerfileFROM python:3.10-slim Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/ContainerfileFROM python:3.10-slim
... ...
You can now edit ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml and run `llama stack run ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml` You can now edit ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml and run `llama stack run ~/meta-llama/llama-stack/tmp/configs/ollama-run.yaml`
``` ```
After this step is successful, you should be able to find the built docker image and test it with `llama stack run <path/to/run.yaml>`. After this step is successful, you should be able to find the built container image and test it with `llama stack run <path/to/run.yaml>`.
::: :::
:::: ::::
## Running your Stack server ### Running your Stack server
Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack build` step. Now, let's start the Llama Stack Distribution Server. You will need the YAML configuration file which was written out at the end by the `llama stack build` step.
``` ```
# Start using template name
llama stack run tgi
# Start using config file
llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml llama stack run ~/.llama/distributions/llamastack-my-local-stack/my-local-stack-run.yaml
``` ```
@ -412,4 +220,4 @@ INFO: 2401:db00:35c:2d2b:face:0:c9:0:54678 - "GET /models/list HTTP/1.1" 200
### Troubleshooting ### Troubleshooting
If you encounter any issues, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file a new issue. If you encounter any issues, ask questions in our Discord or search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file a new issue.

View file

@ -70,20 +70,27 @@ Next up is the most critical part: the set of providers that the stack will use
```yaml ```yaml
providers: providers:
inference: inference:
# provider_id is a string you can choose freely
- provider_id: ollama - provider_id: ollama
# provider_type is a string that specifies the type of provider.
# in this case, the provider for inference is ollama and it is run remotely (outside of the distribution)
provider_type: remote::ollama provider_type: remote::ollama
# config is a dictionary that contains the configuration for the provider.
# in this case, the configuration is the url of the ollama server
config: config:
url: ${env.OLLAMA_URL:http://localhost:11434} url: ${env.OLLAMA_URL:http://localhost:11434}
``` ```
A few things to note: A few things to note:
- A _provider instance_ is identified with an (identifier, type, configuration) tuple. The identifier is a string you can choose freely. - A _provider instance_ is identified with an (id, type, configuration) triplet.
- The id is a string you can choose freely.
- You can instantiate any number of provider instances of the same type. - You can instantiate any number of provider instances of the same type.
- The configuration dictionary is provider-specific. Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server (via docker or via `llama stack run`), you can specify `--env OLLAMA_URL=http://my-server:11434` to override the default value. - The configuration dictionary is provider-specific.
- Notice that configuration can reference environment variables (with default values), which are expanded at runtime. When you run a stack server (via docker or via `llama stack run`), you can specify `--env OLLAMA_URL=http://my-server:11434` to override the default value.
## Resources ## Resources
```
Finally, let's look at the `models` section: Finally, let's look at the `models` section:
```yaml ```yaml
models: models:
- metadata: {} - metadata: {}

View file

@ -1,11 +1,20 @@
# Using Llama Stack as a Library # Using Llama Stack as a Library
If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library. This avoids the overhead of setting up a server. For [example](https://github.com/meta-llama/llama-stack-client-python/blob/main/src/llama_stack_client/lib/direct/test.py): If you are planning to use an external service for Inference (even Ollama or TGI counts as external), it is often easier to use Llama Stack as a library. This avoids the overhead of setting up a server.
```bash
# setup
pip install llama-stack
llama stack build --template together --image-type venv
```
```python ```python
from llama_stack_client.lib.direct.direct import LlamaStackDirectClient from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
client = await LlamaStackDirectClient.from_template('ollama') client = LlamaStackAsLibraryClient(
"ollama",
# provider_data is optional, but if you need to pass in any provider specific data, you can do so here.
provider_data = {"tavily_search_api_key": os.environ['TAVILY_SEARCH_API_KEY']}
)
await client.initialize() await client.initialize()
``` ```
@ -14,23 +23,12 @@ This will parse your config and set up any inline implementations and remote cli
Then, you can access the APIs like `models` and `inference` on the client and call their methods directly: Then, you can access the APIs like `models` and `inference` on the client and call their methods directly:
```python ```python
response = await client.models.list() response = client.models.list()
print(response)
```
```python
response = await client.inference.chat_completion(
messages=[UserMessage(content="What is the capital of France?", role="user")],
model_id="Llama3.1-8B-Instruct",
stream=False,
)
print("\nChat completion response:")
print(response)
``` ```
If you've created a [custom distribution](https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html), you can also use the run.yaml configuration file directly: If you've created a [custom distribution](https://llama-stack.readthedocs.io/en/latest/distributions/building_distro.html), you can also use the run.yaml configuration file directly:
```python ```python
client = await LlamaStackDirectClient.from_config(config_path) client = LlamaStackAsLibraryClient(config_path)
await client.initialize() client.initialize()
``` ```

View file

@ -1,41 +1,27 @@
# Starting a Llama Stack # Starting a Llama Stack Server
You can run a Llama Stack server in one of the following ways:
**As a Library**:
This is the simplest way to get started. Using Llama Stack as a library means you do not need to start a server. This is especially useful when you are not running inference locally and are relying on an external inference service (e.g., Fireworks, Together, Groq, etc.). See [Using Llama Stack as a Library](importing_as_library).
**Docker**:
Another simple way to start interacting with Llama Stack is to spin up a Docker container that is pre-built with all the providers you need. We provide a number of pre-built Docker containers so you can start a Llama Stack server instantly. You can also build your own custom Docker container. Which distribution to choose depends on the hardware you have. See [Selection of a Distribution](distributions/selection) for more details.
**Conda**:
Lastly, if you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
```{toctree} ```{toctree}
:maxdepth: 3 :maxdepth: 1
:hidden: :hidden:
importing_as_library importing_as_library
building_distro building_distro
configuration configuration
``` ```
You can instantiate a Llama Stack in one of the following ways:
- **As a Library**: this is the simplest, especially if you are using an external inference service. See [Using Llama Stack as a Library](importing_as_library)
- **Docker**: we provide a number of pre-built Docker containers so you can start a Llama Stack server instantly. You can also build your own custom Docker container.
- **Conda**: finally, you can build a custom Llama Stack server using `llama stack build` containing the exact combination of providers you wish. We have provided various templates to make getting started easier.
Which templates / distributions to choose depends on the hardware you have for running LLM inference.
- **Do you have access to a machine with powerful GPUs?**
If so, we suggest:
- {dockerhub}`distribution-remote-vllm` ([Guide](self_hosted_distro/remote-vllm))
- {dockerhub}`distribution-meta-reference-gpu` ([Guide](self_hosted_distro/meta-reference-gpu))
- {dockerhub}`distribution-tgi` ([Guide](self_hosted_distro/tgi))
- {dockerhub} `distribution-nvidia` ([Guide](self_hosted_distro/nvidia))
- **Are you running on a "regular" desktop machine?**
If so, we suggest:
- {dockerhub}`distribution-ollama` ([Guide](self_hosted_distro/ollama))
- **Do you have an API key for a remote inference provider like Fireworks, Together, etc.?** If so, we suggest:
- {dockerhub}`distribution-together` ([Guide](self_hosted_distro/together))
- {dockerhub}`distribution-fireworks` ([Guide](self_hosted_distro/fireworks))
- **Do you want to run Llama Stack inference on your iOS / Android device** If so, we suggest:
- [iOS SDK](ondevice_distro/ios_sdk)
- [Android](ondevice_distro/android_sdk)
- **Do you want a hosted Llama Stack endpoint?** If so, we suggest:
- [Remote-Hosted Llama Stack Endpoints](remote_hosted_distro/index)
You can also build your own [custom distribution](building_distro).

View file

@ -1,6 +1,3 @@
---
orphan: true
---
# iOS SDK # iOS SDK
We offer both remote and on-device use of Llama Stack in Swift via two components: We offer both remote and on-device use of Llama Stack in Swift via two components:

View file

@ -1,6 +1,3 @@
---
orphan: true
---
# Remote-Hosted Distributions # Remote-Hosted Distributions
Remote-Hosted distributions are available endpoints serving Llama Stack API that you can directly connect to. Remote-Hosted distributions are available endpoints serving Llama Stack API that you can directly connect to.

View file

@ -8,11 +8,11 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
| datasetio | `remote::huggingface`, `inline::localfs` | | datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` | | eval | `inline::meta-reference` |
| inference | `remote::nvidia` | | inference | `remote::nvidia` |
| memory | `inline::faiss` |
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss` |
### Environment Variables ### Environment Variables

View file

@ -0,0 +1,56 @@
# List of Distributions
Here is a list of distributions, provided out of the box, that you can use to start a Llama Stack server.
## Selection of a Distribution / Template
Which templates / distributions to choose depends on the hardware you have for running LLM inference.
- **Do you want a hosted Llama Stack endpoint?** If so, we suggest leveraging our partners who host Llama Stack endpoints. Namely, _fireworks.ai_ and _together.xyz_.
- Read more about it here - [Remote-Hosted Endpoints](remote_hosted_distro/index).
- **Do you have access to machines with GPUs?** If you wish to run Llama Stack locally or on a cloud instance and host your own Llama Stack endpoint, we suggest:
- {dockerhub}`distribution-remote-vllm` ([Guide](self_hosted_distro/remote-vllm))
- {dockerhub}`distribution-meta-reference-gpu` ([Guide](self_hosted_distro/meta-reference-gpu))
- {dockerhub}`distribution-tgi` ([Guide](self_hosted_distro/tgi))
- {dockerhub}`distribution-nvidia` ([Guide](self_hosted_distro/nvidia))
- **Are you running on a "regular" desktop or laptop?** We suggest using the ollama template for quick prototyping and to get started without having to worry about needing GPUs.
- {dockerhub}`distribution-ollama` ([Guide](self_hosted_distro/ollama))
- **Do you have an API key for a remote inference provider like Fireworks, Together, etc.?** If so, we suggest:
- {dockerhub}`distribution-together` ([Guide](self_hosted_distro/together))
- {dockerhub}`distribution-fireworks` ([Guide](self_hosted_distro/fireworks))
- **Do you want to run Llama Stack inference on your iOS / Android device?** Lastly, we also provide templates for running Llama Stack inference on your iOS / Android device:
- [iOS SDK](ondevice_distro/ios_sdk)
- [Android](ondevice_distro/android_sdk)
- **If none of the above fit your needs, you can also build your own [custom distribution](building_distro).**
### Distribution Details
```{toctree}
:maxdepth: 1
remote_hosted_distro/index
self_hosted_distro/remote-vllm
self_hosted_distro/meta-reference-gpu
self_hosted_distro/tgi
self_hosted_distro/nvidia
self_hosted_distro/ollama
self_hosted_distro/together
self_hosted_distro/fireworks
ondevice_distro/index
```
### On-Device Distributions
```{toctree}
:maxdepth: 1
ondevice_distro/ios_sdk
ondevice_distro/android_sdk
```

View file

@ -15,11 +15,11 @@ The `llamastack/distribution-bedrock` distribution consists of the following pro
| datasetio | `remote::huggingface`, `inline::localfs` | | datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` | | eval | `inline::meta-reference` |
| inference | `remote::bedrock` | | inference | `remote::bedrock` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `remote::bedrock` | | safety | `remote::bedrock` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |

View file

@ -8,11 +8,11 @@ The `llamastack/distribution-cerebras` distribution consists of the following pr
| datasetio | `remote::huggingface`, `inline::localfs` | | datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` | | eval | `inline::meta-reference` |
| inference | `remote::cerebras` | | inference | `remote::cerebras` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
### Environment Variables ### Environment Variables

View file

@ -18,11 +18,11 @@ The `llamastack/distribution-fireworks` distribution consists of the following p
| datasetio | `remote::huggingface`, `inline::localfs` | | datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` | | eval | `inline::meta-reference` |
| inference | `remote::fireworks` | | inference | `remote::fireworks` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
### Environment Variables ### Environment Variables

View file

@ -18,11 +18,11 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
| datasetio | `remote::huggingface`, `inline::localfs` | | datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` | | eval | `inline::meta-reference` |
| inference | `inline::meta-reference` | | inference | `inline::meta-reference` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
Note that you need access to NVIDIA GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs. Note that you need access to NVIDIA GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.

View file

@ -18,11 +18,11 @@ The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists
| datasetio | `remote::huggingface`, `inline::localfs` | | datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` | | eval | `inline::meta-reference` |
| inference | `inline::meta-reference-quantized` | | inference | `inline::meta-reference-quantized` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc. The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc.

View file

@ -18,11 +18,11 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
| datasetio | `remote::huggingface`, `inline::localfs` | | datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` | | eval | `inline::meta-reference` |
| inference | `remote::ollama` | | inference | `remote::ollama` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration. You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.

### Environment Variables ### Environment Variables
@ -82,11 +82,15 @@ docker run \
If you are using Llama Stack Safety / Shield APIs, use: If you are using Llama Stack Safety / Shield APIs, use:
```bash ```bash
# You need a local checkout of llama-stack to run this, get it using
# git clone https://github.com/meta-llama/llama-stack.git
cd /path/to/llama-stack
docker run \ docker run \
-it \ -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \ -v ~/.llama:/root/.llama \
-v ./run-with-safety.yaml:/root/my-run.yaml \ -v ./llama_stack/templates/ollama/run-with-safety.yaml:/root/my-run.yaml \
llamastack/distribution-ollama \ llamastack/distribution-ollama \
--yaml-config /root/my-run.yaml \ --yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \ --port $LLAMA_STACK_PORT \

View file

@ -14,11 +14,14 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following
| API | Provider(s) | | API | Provider(s) |
|-----|-------------| |-----|-------------|
| agents | `inline::meta-reference` | | agents | `inline::meta-reference` |
| datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` |
| inference | `remote::vllm` | | inference | `remote::vllm` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference. You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
@ -107,10 +110,15 @@ If you are using Llama Stack Safety / Shield APIs, use:
export SAFETY_PORT=8081 export SAFETY_PORT=8081
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
# You need a local checkout of llama-stack to run this, get it using
# git clone https://github.com/meta-llama/llama-stack.git
cd /path/to/llama-stack
docker run \ docker run \
-it \ -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run-with-safety.yaml:/root/my-run.yaml \ -v ~/.llama:/root/.llama \
-v ./llama_stack/templates/remote-vllm/run-with-safety.yaml:/root/my-run.yaml \
llamastack/distribution-remote-vllm \ llamastack/distribution-remote-vllm \
--yaml-config /root/my-run.yaml \ --yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \ --port $LLAMA_STACK_PORT \

View file

@ -0,0 +1,75 @@
---
orphan: true
---
# SambaNova Distribution
```{toctree}
:maxdepth: 2
:hidden:
self
```
The `llamastack/distribution-sambanova` distribution consists of the following provider configurations.
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| inference | `remote::sambanova` |
| safety | `inline::llama-guard` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
### Environment Variables
The following environment variables can be configured:
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
- `SAMBANOVA_API_KEY`: SambaNova.AI API Key (default: ``)
### Models
The following models are available by default:
- `meta-llama/Llama-3.1-8B-Instruct (Meta-Llama-3.1-8B-Instruct)`
- `meta-llama/Llama-3.1-70B-Instruct (Meta-Llama-3.1-70B-Instruct)`
- `meta-llama/Llama-3.1-405B-Instruct-FP8 (Meta-Llama-3.1-405B-Instruct)`
- `meta-llama/Llama-3.2-1B-Instruct (Meta-Llama-3.2-1B-Instruct)`
- `meta-llama/Llama-3.2-3B-Instruct (Meta-Llama-3.2-3B-Instruct)`
- `meta-llama/Llama-3.2-11B-Vision-Instruct (Llama-3.2-11B-Vision-Instruct)`
- `meta-llama/Llama-3.2-90B-Vision-Instruct (Llama-3.2-90B-Vision-Instruct)`
### Prerequisite: API Keys
Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](https://sambanova.ai/).
## Running Llama Stack with SambaNova
You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
llamastack/distribution-sambanova \
--port $LLAMA_STACK_PORT \
--env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
```
### Via Conda
```bash
llama stack build --template sambanova --image-type conda
llama stack run ./run.yaml \
--port $LLAMA_STACK_PORT \
--env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY
```

View file

@ -19,11 +19,11 @@ The `llamastack/distribution-tgi` distribution consists of the following provide
| datasetio | `remote::huggingface`, `inline::localfs` | | datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` | | eval | `inline::meta-reference` |
| inference | `remote::tgi` | | inference | `remote::tgi` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
You can use this distribution if you have GPUs and want to run an independent TGI server container for running inference. You can use this distribution if you have GPUs and want to run an independent TGI server container for running inference.
@ -102,10 +102,15 @@ docker run \
If you are using Llama Stack Safety / Shield APIs, use: If you are using Llama Stack Safety / Shield APIs, use:
```bash ```bash
# You need a local checkout of llama-stack to run this, get it using
# git clone https://github.com/meta-llama/llama-stack.git
cd /path/to/llama-stack
docker run \ docker run \
-it \ -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run-with-safety.yaml:/root/my-run.yaml \ -v ~/.llama:/root/.llama \
-v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \
llamastack/distribution-tgi \ llamastack/distribution-tgi \
--yaml-config /root/my-run.yaml \ --yaml-config /root/my-run.yaml \
--port $LLAMA_STACK_PORT \ --port $LLAMA_STACK_PORT \

View file

@ -18,11 +18,11 @@ The `llamastack/distribution-together` distribution consists of the following pr
| datasetio | `remote::huggingface`, `inline::localfs` | | datasetio | `remote::huggingface`, `inline::localfs` |
| eval | `inline::meta-reference` | | eval | `inline::meta-reference` |
| inference | `remote::together` | | inference | `remote::together` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `inline::llama-guard` | | safety | `inline::llama-guard` |
| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::memory-runtime` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` |
| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
### Environment Variables ### Environment Variables

View file

@ -1,33 +1,49 @@
# Quick Start # Quick Start
In this guide, we'll walk through how you can use the Llama Stack client SDK to build a simple RAG agent. In this guide, we'll walk through how you can use the Llama Stack (server and client SDK) to test a simple RAG agent.
The most critical requirement for running the agent is running inference on the underlying Llama model. Depending on what hardware (GPUs) you have available, you have various options. We will use `Ollama` for this purpose as it is the easiest to get started with and yet robust. A Llama Stack agent is a simple integrated system that can perform tasks by combining a Llama model for reasoning with tools (e.g., RAG, web search, code execution, etc.) for taking actions.
First, let's set up some environment variables that we will use in the rest of the guide. Note that if you open up a new terminal, you will need to set these again. In Llama Stack, we provide a server exposing multiple APIs. These APIs are backed by implementations from different providers. For this guide, we will use [Ollama](https://ollama.com/) as the inference provider.
```bash
export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
# ollama names this model differently, and we must use the ollama name when loading the model
export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16"
export LLAMA_STACK_PORT=5001
```
### 1. Start Ollama ### 1. Start Ollama
```bash ```bash
ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m ollama run llama3.2:3b-instruct-fp16 --keepalive 60m
``` ```
By default, Ollama keeps the model loaded in memory for 5 minutes which can be too short. We set the `--keepalive` flag to 60 minutes to ensure the model remains loaded for some time. By default, Ollama keeps the model loaded in memory for 5 minutes which can be too short. We set the `--keepalive` flag to 60 minutes to ensure the model remains loaded for some time.
NOTE: If you do not have ollama, you can install it from [here](https://ollama.ai/docs/installation).
### 2. Start the Llama Stack server
Llama Stack is based on a client-server architecture. It consists of a server which can be configured very flexibly so you can mix-and-match various providers for its individual API components -- beyond Inference, these include Memory, Agents, Telemetry, Evals and so forth.
### 2. Pick a client environment
Llama Stack has a service-oriented architecture, so every interaction with the Stack happens through a REST interface. You can interact with the Stack in two ways:
* Install the `llama-stack-client` PyPI package and point `LlamaStackClient` to a local or remote Llama Stack server.
* Or, install the `llama-stack` PyPI package and use the Stack as a library using `LlamaStackAsLibraryClient`.
```{admonition} Note
:class: tip
The API is **exactly identical** for both clients.
```
:::{dropdown} Starting up the Llama Stack server
The Llama Stack server can be configured flexibly so you can mix-and-match various providers for its individual API components -- beyond Inference, these include Vector IO, Agents, Telemetry, Evals, Post Training, etc.
To get started quickly, we provide various Docker images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the Docker image. To get started quickly, we provide various Docker images for the server component that work with different inference providers out of the box. For this guide, we will use `llamastack/distribution-ollama` as the Docker image.
Let's set up some environment variables that we will use in the rest of the guide.
```bash
export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
export LLAMA_STACK_PORT=8321
```
You can start the server using the following command:
```bash ```bash
docker run -it \ docker run -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@ -37,14 +53,18 @@ docker run -it \
--env INFERENCE_MODEL=$INFERENCE_MODEL \ --env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434 --env OLLAMA_URL=http://host.docker.internal:11434
``` ```
Configuration for this is available at `distributions/ollama/run.yaml`. Configuration for this is available at `distributions/ollama/run.yaml`.
:::
### 3. Use the Llama Stack client SDK
:::{dropdown} Installing the Llama Stack client CLI and SDK
You can interact with the Llama Stack server using various client SDKs. We will use the Python SDK which you can install using the following command. Note that you must be using Python 3.10 or newer: You can interact with the Llama Stack server using various client SDKs. We will use the Python SDK which you can install using the following command. Note that you must be using Python 3.10 or newer:
```bash ```bash
yes | conda create -n stack-client python=3.10
conda activate stack-client
pip install llama-stack-client pip install llama-stack-client
``` ```
@ -66,17 +86,35 @@ llama-stack-client \
inference chat-completion \ inference chat-completion \
--message "hello, what model are you?" --message "hello, what model are you?"
``` ```
:::
Here is a simple example to perform chat completions using Python instead of the CLI. &nbsp;
### 3. Run inference with Python SDK
Here is a simple example to perform chat completions using the SDK.
```python ```python
import os import os
from llama_stack_client import LlamaStackClient
client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}") def create_http_client():
from llama_stack_client import LlamaStackClient
return LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
def create_library_client(template="ollama"):
from llama_stack import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient(template)
client.initialize()
return client
client = create_library_client() # or create_http_client() depending on the environment you picked
# List available models # List available models
models = client.models.list() models = client.models.list()
print(models) print("--- Available models: ---")
for m in models:
print(f"- {m.identifier}")
print()
response = client.inference.chat_completion( response = client.inference.chat_completion(
model_id=os.environ["INFERENCE_MODEL"], model_id=os.environ["INFERENCE_MODEL"],
@ -90,62 +128,78 @@ print(response.completion_message.content)
### 4. Your first RAG agent ### 4. Your first RAG agent
Here is an example of a simple RAG agent that uses the Llama Stack client SDK. Here is an example of a simple RAG (Retrieval Augmented Generation) chatbot agent which can answer questions about TorchTune documentation.
```python ```python
import asyncio
import os import os
from termcolor import cprint
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client.types import Attachment
from llama_stack_client.types.agent_create_params import AgentConfig from llama_stack_client.types.agent_create_params import AgentConfig
from llama_stack_client.types import Document
client = create_library_client() # or create_http_client() depending on the environment you picked
async def run_main(): # Documents to be used for RAG
urls = ["chat.rst", "llama3.rst", "datasets.rst", "lora_finetune.rst"] urls = ["chat.rst", "llama3.rst", "datasets.rst", "lora_finetune.rst"]
attachments = [ documents = [
Attachment( Document(
content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}", document_id=f"num-{i}",
mime_type="text/plain", content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}",
) mime_type="text/plain",
for i, url in enumerate(urls) metadata={},
]
client = LlamaStackClient(base_url=f"http://localhost:{os.environ['LLAMA_STACK_PORT']}")
agent_config = AgentConfig(
model=os.environ["INFERENCE_MODEL"],
instructions="You are a helpful assistant",
tools=[{"type": "memory"}], # enable Memory aka RAG
enable_session_persistence=True,
) )
for i, url in enumerate(urls)
]
agent = Agent(client, agent_config) # Register a vector database
session_id = agent.create_session("test-session") vector_db_id = "test-vector-db"
user_prompts = [ client.vector_dbs.register(
( vector_db_id=vector_db_id,
"I am attaching documentation for Torchtune. Help me answer questions I will ask next.", embedding_model="all-MiniLM-L6-v2",
attachments, embedding_dimension=384,
), )
(
"What are the top 5 topics that were explained? Only list succinct bullet points.",
None,
),
]
for prompt, attachments in user_prompts:
response = agent.create_turn(
messages=[{"role": "user", "content": prompt}],
attachments=attachments,
session_id=session_id,
)
for log in EventLogger().log(response):
log.print()
# Insert the documents into the vector database
client.tool_runtime.rag_tool.insert(
documents=documents,
vector_db_id=vector_db_id,
chunk_size_in_tokens=512,
)
if __name__ == "__main__": agent_config = AgentConfig(
asyncio.run(run_main()) model=os.environ["INFERENCE_MODEL"],
# Define instructions for the agent (aka system prompt)
instructions="You are a helpful assistant",
enable_session_persistence=False,
# Define tools available to the agent
toolgroups = [
{
"name": "builtin::rag",
"args" : {
"vector_db_ids": [vector_db_id],
}
}
],
)
rag_agent = Agent(client, agent_config)
session_id = rag_agent.create_session("test-session")
user_prompts = [
"What are the top 5 topics that were explained? Only list succinct bullet points.",
]
# Run the agent loop by calling the `create_turn` method
for prompt in user_prompts:
cprint(f'User> {prompt}', 'green')
response = rag_agent.create_turn(
messages=[{"role": "user", "content": prompt}],
session_id=session_id,
)
for log in EventLogger().log(response):
log.print()
``` ```
## Next Steps ## Next Steps

View file

@ -1,23 +1,34 @@
```{admonition} News
:class: tip
Llama Stack 0.1.0 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.0) for more details.
```
# Llama Stack # Llama Stack
Llama Stack defines and standardizes the set of core building blocks needed to bring generative AI applications to market. These building blocks are presented in the form of interoperable APIs with a broad set of Service Providers providing their implementations.
Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. It provides a unified set of APIs with implementations from leading service providers, enabling seamless transitions between development and production environments. More specifically, it provides:
- **Unified API layer** for Inference, RAG, Agents, Tools, Safety, Evals, and Telemetry.
- **Plugin architecture** to support the rich ecosystem of implementations of the different APIs in different environments like local development, on-premises, cloud, and mobile.
- **Prepackaged verified distributions** which offer a one-stop solution for developers to get started quickly and reliably in any environment
- **Multiple developer interfaces** like CLI and SDKs for Python, Node, iOS, and Android
- **Standalone applications** as examples for how to build production-grade AI applications with Llama Stack
We focus on making it easy to build production applications with the Llama model family - from the latest Llama 3.3 to specialized models like Llama Guard for safety.
```{image} ../_static/llama-stack.png ```{image} ../_static/llama-stack.png
:alt: Llama Stack :alt: Llama Stack
:width: 400px :width: 400px
``` ```
Our goal is to provide pre-packaged implementations which can be operated in a variety of deployment environments: developers start iterating with Desktops or their mobile devices and can seamlessly transition to on-prem or public cloud deployments. At every point in this transition, the same set of APIs and the same developer experience is available. Our goal is to provide pre-packaged implementations (aka "distributions") which can be run in a variety of deployment environments. Llama Stack can assist you in your entire app development lifecycle - start iterating on local, mobile or desktop and seamlessly transition to on-prem or public cloud deployments. At every point in this transition, the same set of APIs and the same developer experience is available.
```{note}
The Stack APIs are rapidly improving but still a work-in-progress. We invite feedback as well as direct contributions.
```
## Quick Links ## Quick Links
- New to Llama Stack? Start with the [Introduction](introduction/index) to understand our motivation and vision. - New to Llama Stack? Start with the [Introduction](introduction/index) to understand our motivation and vision.
- Ready to build? Check out the [Quick Start](getting_started/index) to get started. - Ready to build? Check out the [Quick Start](getting_started/index) to get started.
- Need specific providers? Browse [Distributions](distributions/index) to see all the options available. - Need specific providers? Browse [Distributions](distributions/selection) to see all the options available.
- Want to contribute? See the [Contributing](contributing/index) guide. - Want to contribute? See the [Contributing](contributing/index) guide.
## Available SDKs ## Available SDKs
@ -33,33 +44,52 @@ We have a number of client-side SDKs available for different languages.
## Supported Llama Stack Implementations ## Supported Llama Stack Implementations
A number of "adapters" are available for some popular Inference and Memory (Vector Store) providers. For other APIs (particularly Safety and Agents), we provide *reference implementations* you can use to get started. We expect this list to grow over time. We are slowly onboarding more providers to the ecosystem as we get more confidence in the APIs. A number of "adapters" are available for some popular Inference and Vector Store providers. For other APIs (particularly Safety and Agents), we provide *reference implementations* you can use to get started. We expect this list to grow over time. We are slowly onboarding more providers to the ecosystem as we get more confidence in the APIs.
**Inference API**
| **Provider** | **Environments** |
| :----: | :----: |
| Meta Reference | Single Node |
| Ollama | Single Node |
| Fireworks | Hosted |
| Together | Hosted |
| NVIDIA NIM | Hosted and Single Node |
| vLLM | Hosted and Single Node |
| TGI | Hosted and Single Node |
| AWS Bedrock | Hosted |
| Cerebras | Hosted |
| Groq | Hosted |
| SambaNova | Hosted |
| PyTorch ExecuTorch | On-device iOS, Android |
**Vector IO API**
| **Provider** | **Environments** |
| :----: | :----: |
| FAISS | Single Node |
| Chroma | Hosted and Single Node |
| Postgres (PGVector) | Hosted and Single Node |
| Weaviate | Hosted |
**Safety API**
| **Provider** | **Environments** |
| :----: | :----: |
| Llama Guard | Depends on Inference Provider |
| Prompt Guard | Single Node |
| Code Scanner | Single Node |
| AWS Bedrock | Hosted |
| **API Provider** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** |
| :----: | :----: | :----: | :----: | :----: | :----: | :----: |
| Meta Reference | Single Node | Y | Y | Y | Y | Y |
| Cerebras | Single Node | | Y | | | |
| Fireworks | Hosted | Y | Y | Y | | |
| AWS Bedrock | Hosted | | Y | | Y | |
| Together | Hosted | Y | Y | | Y | |
| Ollama | Single Node | | Y | | |
| TGI | Hosted and Single Node | | Y | | |
| [NVIDIA NIM](https://build.nvidia.com/nim?filters=nimType%3Anim_type_run_anywhere&q=llama) | Hosted and Single Node | | Y | | |
| Chroma | Single Node | | | Y | | |
| Postgres | Single Node | | | Y | | |
| PyTorch ExecuTorch | On-device iOS | Y | Y | | |
| PyTorch ExecuTorch | On-device Android | | Y | | |
```{toctree} ```{toctree}
:hidden: :hidden:
:maxdepth: 3 :maxdepth: 3
self
introduction/index introduction/index
getting_started/index getting_started/index
concepts/index concepts/index
distributions/index distributions/index
distributions/selection
building_applications/index building_applications/index
benchmark_evaluations/index
playground/index playground/index
contributing/index contributing/index
references/index references/index

View file

@ -19,77 +19,45 @@ Building production AI applications today requires solving multiple challenges:
- Changing providers requires significant code changes. - Changing providers requires significant code changes.
### The Vision: A Universal Stack ### Our Solution: A Universal Stack
```{image} ../../_static/llama-stack.png ```{image} ../../_static/llama-stack.png
:alt: Llama Stack :alt: Llama Stack
:width: 400px :width: 400px
``` ```
Llama Stack defines and standardizes the core building blocks needed to bring generative AI applications to market. These building blocks are presented as interoperable APIs with a broad set of Service Providers providing their implementations. Llama Stack addresses these challenges through a service-oriented, API-first approach:
#### Service-oriented Design **Develop Anywhere, Deploy Everywhere**
Unlike other frameworks, Llama Stack is built with a service-oriented, REST API-first approach. Such a design not only allows for seamless transitions from local to remote deployments but also forces the design to be more declarative. This restriction can result in a much simpler, robust developer experience. The same code works across different environments: - Start locally with CPU-only setups
- Move to GPU acceleration when needed
- Deploy to cloud or edge without code changes
- Same APIs and developer experience everywhere
- Local development with CPU-only setups **Production-Ready Building Blocks**
- Self-hosted with GPU acceleration - Pre-built safety guardrails and content filtering
- Cloud-hosted on providers like AWS, Fireworks, Together - Built-in RAG and agent capabilities
- On-device for iOS and Android - Comprehensive evaluation toolkit
#### Composability
The APIs we design are composable. An Agent abstractly depends on { Inference, Memory, Safety } APIs but does not care about the actual implementation details. Safety itself may require model inference and hence can depend on the Inference API.
#### Turnkey Solutions
We provide turnkey solutions for popular deployment scenarios. It should be easy to deploy a Llama Stack server on AWS or in a private data center. Either of these should allow a developer to get started with powerful agentic apps, model evaluations, or fine-tuning services in minutes.
We have built-in support for critical needs:
- Safety guardrails and content filtering
- Comprehensive evaluation capabilities
- Full observability and monitoring - Full observability and monitoring
- Provider federation and fallback
#### Focus on Llama Models **True Provider Independence**
As a Meta-initiated project, we explicitly focus on Meta's Llama series of models. Supporting the broad set of open models is no easy task and we want to start with models we understand best. - Swap providers without application changes
- Mix and match best-in-class implementations
- Federation and fallback support
- No vendor lock-in
#### Supporting the Ecosystem **Robust Ecosystem**
There is a vibrant ecosystem of Providers which provide efficient inference or scalable vector stores or powerful observability solutions. We want to make sure it is easy for developers to pick and choose the best implementations for their use cases. We also want to make sure it is easy for new Providers to onboard and participate in the ecosystem. - Llama Stack is already integrated with distribution partners (cloud providers, hardware vendors, and AI-focused companies).
- The ecosystem offers tailored infrastructure, software, and services for deploying Llama models.
Additionally, we have designed every element of the Stack such that APIs as well as Resources (like Models) can be federated.
#### Rich Provider Ecosystem
```{list-table}
:header-rows: 1
* - Provider
- Local
- Self-hosted
- Cloud
* - Inference
- Ollama
- vLLM, TGI
- Fireworks, Together, AWS
* - Memory
- FAISS
- Chroma, pgvector
- Weaviate
* - Safety
- Llama Guard
- -
- AWS Bedrock
```
### Unified API Layer ### Our Philosophy
Llama Stack provides a consistent interface for: - **Service-Oriented**: REST APIs enforce clean interfaces and enable seamless transitions across different environments.
- **Composability**: Every component is independent but works together seamlessly
- **Production Ready**: Built for real-world applications, not just demos
- **Turnkey Solutions**: Easy-to-deploy, built-in solutions for popular deployment scenarios
- **Llama First**: Explicit focus on Meta's Llama models and partnering ecosystem
- **Inference**: Run LLM models efficiently
- **Safety**: Apply content filtering and safety policies With Llama Stack, you can focus on building your application while we handle the infrastructure complexity, essential capabilities, and provider integrations.
- **Memory**: Store and retrieve knowledge for RAG
- **Agents**: Build multi-step workflows
- **Evaluation**: Test and improve application quality

View file

@ -103,36 +103,35 @@ $ llama-stack-client models update <model_id> [--provider-id <provider_id>] [--p
$ llama-stack-client models delete <model_id> $ llama-stack-client models delete <model_id>
``` ```
## Memory Bank Management ## Vector DB Management
### `llama-stack-client memory_banks list` ### `llama-stack-client vector_dbs list`
```bash ```bash
$ llama-stack-client memory_banks list $ llama-stack-client vector_dbs list
``` ```
``` ```
+--------------+----------------+--------+-------------------+------------------------+--------------------------+ +--------------+----------------+---------------------+---------------+------------------------+
| identifier | provider_id | type | embedding_model | chunk_size_in_tokens | overlap_size_in_tokens | | identifier | provider_id | provider_resource_id| vector_db_type| params |
+==============+================+========+===================+========================+==========================+ +==============+================+=====================+===============+========================+
| test_bank | meta-reference | vector | all-MiniLM-L6-v2 | 512 | 64 | | test_bank | meta-reference | test_bank | vector | embedding_model: all-MiniLM-L6-v2
+--------------+----------------+--------+-------------------+------------------------+--------------------------+ embedding_dimension: 384|
+--------------+----------------+---------------------+---------------+------------------------+
``` ```
### `llama-stack-client memory_banks register` ### `llama-stack-client vector_dbs register`
```bash ```bash
$ llama-stack-client memory_banks register <memory-bank-id> --type <type> [--provider-id <provider-id>] [--provider-memory-bank-id <provider-memory-bank-id>] [--chunk-size <chunk-size>] [--embedding-model <embedding-model>] [--overlap-size <overlap-size>] $ llama-stack-client vector_dbs register <vector-db-id> [--provider-id <provider-id>] [--provider-vector-db-id <provider-vector-db-id>] [--embedding-model <embedding-model>] [--embedding-dimension <embedding-dimension>]
``` ```
Options: Options:
- `--type`: Required. Type of memory bank. Choices: "vector", "keyvalue", "keyword", "graph" - `--provider-id`: Optional. Provider ID for the vector db
- `--provider-id`: Optional. Provider ID for the memory bank - `--provider-vector-db-id`: Optional. Provider's vector db ID
- `--provider-memory-bank-id`: Optional. Provider's memory bank ID - `--embedding-model`: Optional. Embedding model to use. Default: "all-MiniLM-L6-v2"
- `--chunk-size`: Optional. Chunk size in tokens (for vector type). Default: 512 - `--embedding-dimension`: Optional. Dimension of embeddings. Default: 384
- `--embedding-model`: Optional. Embedding model (for vector type). Default: "all-MiniLM-L6-v2"
- `--overlap-size`: Optional. Overlap size in tokens (for vector type). Default: 64
### `llama-stack-client memory_banks unregister` ### `llama-stack-client vector_dbs unregister`
```bash ```bash
$ llama-stack-client memory_banks unregister <memory-bank-id> $ llama-stack-client vector_dbs unregister <vector-db-id>
``` ```
## Shield Management ## Shield Management
@ -200,11 +199,7 @@ Example eval_task_config.json:
"type": "model", "type": "model",
"model": "Llama3.1-405B-Instruct", "model": "Llama3.1-405B-Instruct",
"sampling_params": { "sampling_params": {
"strategy": { "strategy": "greedy"
"type": "greedy"
},
"max_tokens": 0,
"repetition_penalty": 1.0
} }
} }
} }
@ -220,3 +215,44 @@ Options:
- `--output-dir`: Required. Path to the directory where scoring results will be saved - `--output-dir`: Required. Path to the directory where scoring results will be saved
- `--num-examples`: Optional. Number of examples to evaluate (useful for debugging) - `--num-examples`: Optional. Number of examples to evaluate (useful for debugging)
- `--visualize`: Optional flag. If set, visualizes scoring results after completion - `--visualize`: Optional flag. If set, visualizes scoring results after completion
## Tool Group Management
### `llama-stack-client toolgroups list`
```bash
$ llama-stack-client toolgroups list
```
```
+---------------------------+------------------+------+---------------+
| identifier | provider_id | args | mcp_endpoint |
+===========================+==================+======+===============+
| builtin::code_interpreter | code-interpreter | None | None |
+---------------------------+------------------+------+---------------+
| builtin::rag | rag-runtime | None | None |
+---------------------------+------------------+------+---------------+
| builtin::websearch | tavily-search | None | None |
+---------------------------+------------------+------+---------------+
```
### `llama-stack-client toolgroups get`
```bash
$ llama-stack-client toolgroups get <toolgroup_id>
```
Shows detailed information about a specific toolgroup. If the toolgroup is not found, displays an error message.
### `llama-stack-client toolgroups register`
```bash
$ llama-stack-client toolgroups register <toolgroup_id> [--provider-id <provider-id>] [--provider-toolgroup-id <provider-toolgroup-id>] [--mcp-config <mcp-config>] [--args <args>]
```
Options:
- `--provider-id`: Optional. Provider ID for the toolgroup
- `--provider-toolgroup-id`: Optional. Provider's toolgroup ID
- `--mcp-config`: Optional. JSON configuration for the MCP endpoint
- `--args`: Optional. JSON arguments for the toolgroup
### `llama-stack-client toolgroups unregister`
```bash
$ llama-stack-client toolgroups unregister <toolgroup_id>
```

View file

@ -4,29 +4,77 @@
```python ```python
from llama_stack_client.types import ( from llama_stack_client.types import (
Attachment, AgentConfig,
BatchCompletion, BatchCompletion,
CompletionMessage, CompletionMessage,
ContentDelta,
Document,
InterleavedContent,
InterleavedContentItem,
Message,
ParamType,
QueryConfig,
QueryResult,
ReturnType,
SafetyViolation,
SamplingParams, SamplingParams,
ScoringResult,
SystemMessage, SystemMessage,
ToolCall, ToolCall,
ToolParamDefinition,
ToolResponseMessage, ToolResponseMessage,
URL,
UserMessage, UserMessage,
) )
``` ```
## Telemetry ## Toolgroups
Types: Types:
```python ```python
from llama_stack_client.types import TelemetryGetTraceResponse from llama_stack_client.types import ListToolGroupsResponse, ToolGroup, ToolgroupListResponse
``` ```
Methods: Methods:
- <code title="get /telemetry/get_trace">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">get_trace</a>(\*\*<a href="src/llama_stack_client/types/telemetry_get_trace_params.py">params</a>) -> <a href="./src/llama_stack_client/types/telemetry_get_trace_response.py">TelemetryGetTraceResponse</a></code> - <code title="get /v1/toolgroups">client.toolgroups.<a href="./src/llama_stack_client/resources/toolgroups.py">list</a>() -> <a href="./src/llama_stack_client/types/toolgroup_list_response.py">ToolgroupListResponse</a></code>
- <code title="post /telemetry/log_event">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">log</a>(\*\*<a href="src/llama_stack_client/types/telemetry_log_params.py">params</a>) -> None</code> - <code title="get /v1/toolgroups/{toolgroup_id}">client.toolgroups.<a href="./src/llama_stack_client/resources/toolgroups.py">get</a>(toolgroup_id) -> <a href="./src/llama_stack_client/types/tool_group.py">ToolGroup</a></code>
- <code title="post /v1/toolgroups">client.toolgroups.<a href="./src/llama_stack_client/resources/toolgroups.py">register</a>(\*\*<a href="src/llama_stack_client/types/toolgroup_register_params.py">params</a>) -> None</code>
- <code title="delete /v1/toolgroups/{toolgroup_id}">client.toolgroups.<a href="./src/llama_stack_client/resources/toolgroups.py">unregister</a>(toolgroup_id) -> None</code>
## Tools
Types:
```python
from llama_stack_client.types import ListToolsResponse, Tool, ToolListResponse
```
Methods:
- <code title="get /v1/tools">client.tools.<a href="./src/llama_stack_client/resources/tools.py">list</a>(\*\*<a href="src/llama_stack_client/types/tool_list_params.py">params</a>) -> <a href="./src/llama_stack_client/types/tool_list_response.py">ToolListResponse</a></code>
- <code title="get /v1/tools/{tool_name}">client.tools.<a href="./src/llama_stack_client/resources/tools.py">get</a>(tool_name) -> <a href="./src/llama_stack_client/types/tool.py">Tool</a></code>
## ToolRuntime
Types:
```python
from llama_stack_client.types import ToolDef, ToolInvocationResult
```
Methods:
- <code title="post /v1/tool-runtime/invoke">client.tool_runtime.<a href="./src/llama_stack_client/resources/tool_runtime/tool_runtime.py">invoke_tool</a>(\*\*<a href="src/llama_stack_client/types/tool_runtime_invoke_tool_params.py">params</a>) -> <a href="./src/llama_stack_client/types/tool_invocation_result.py">ToolInvocationResult</a></code>
- <code title="get /v1/tool-runtime/list-tools">client.tool_runtime.<a href="./src/llama_stack_client/resources/tool_runtime/tool_runtime.py">list_tools</a>(\*\*<a href="src/llama_stack_client/types/tool_runtime_list_tools_params.py">params</a>) -> <a href="./src/llama_stack_client/types/tool_def.py">JSONLDecoder[ToolDef]</a></code>
### RagTool
Methods:
- <code title="post /v1/tool-runtime/rag-tool/insert">client.tool_runtime.rag_tool.<a href="./src/llama_stack_client/resources/tool_runtime/rag_tool.py">insert</a>(\*\*<a href="src/llama_stack_client/types/tool_runtime/rag_tool_insert_params.py">params</a>) -> None</code>
- <code title="post /v1/tool-runtime/rag-tool/query">client.tool_runtime.rag_tool.<a href="./src/llama_stack_client/resources/tool_runtime/rag_tool.py">query</a>(\*\*<a href="src/llama_stack_client/types/tool_runtime/rag_tool_query_params.py">params</a>) -> <a href="./src/llama_stack_client/types/shared/query_result.py">QueryResult</a></code>
## Agents ## Agents
@ -36,20 +84,19 @@ Types:
from llama_stack_client.types import ( from llama_stack_client.types import (
InferenceStep, InferenceStep,
MemoryRetrievalStep, MemoryRetrievalStep,
RestAPIExecutionConfig,
ShieldCallStep, ShieldCallStep,
ToolExecutionStep, ToolExecutionStep,
ToolParamDefinition, ToolResponse,
AgentCreateResponse, AgentCreateResponse,
) )
``` ```
Methods: Methods:
- <code title="post /agents/create">client.agents.<a href="./src/llama_stack_client/resources/agents/agents.py">create</a>(\*\*<a href="src/llama_stack_client/types/agent_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agent_create_response.py">AgentCreateResponse</a></code> - <code title="post /v1/agents">client.agents.<a href="./src/llama_stack_client/resources/agents/agents.py">create</a>(\*\*<a href="src/llama_stack_client/types/agent_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agent_create_response.py">AgentCreateResponse</a></code>
- <code title="post /agents/delete">client.agents.<a href="./src/llama_stack_client/resources/agents/agents.py">delete</a>(\*\*<a href="src/llama_stack_client/types/agent_delete_params.py">params</a>) -> None</code> - <code title="delete /v1/agents/{agent_id}">client.agents.<a href="./src/llama_stack_client/resources/agents/agents.py">delete</a>(agent_id) -> None</code>
### Sessions ### Session
Types: Types:
@ -59,104 +106,106 @@ from llama_stack_client.types.agents import Session, SessionCreateResponse
Methods: Methods:
- <code title="post /agents/session/create">client.agents.sessions.<a href="./src/llama_stack_client/resources/agents/sessions.py">create</a>(\*\*<a href="src/llama_stack_client/types/agents/session_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/session_create_response.py">SessionCreateResponse</a></code> - <code title="post /v1/agents/{agent_id}/session">client.agents.session.<a href="./src/llama_stack_client/resources/agents/session.py">create</a>(agent_id, \*\*<a href="src/llama_stack_client/types/agents/session_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/session_create_response.py">SessionCreateResponse</a></code>
- <code title="post /agents/session/get">client.agents.sessions.<a href="./src/llama_stack_client/resources/agents/sessions.py">retrieve</a>(\*\*<a href="src/llama_stack_client/types/agents/session_retrieve_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/session.py">Session</a></code> - <code title="get /v1/agents/{agent_id}/session/{session_id}">client.agents.session.<a href="./src/llama_stack_client/resources/agents/session.py">retrieve</a>(session_id, \*, agent_id, \*\*<a href="src/llama_stack_client/types/agents/session_retrieve_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/session.py">Session</a></code>
- <code title="post /agents/session/delete">client.agents.sessions.<a href="./src/llama_stack_client/resources/agents/sessions.py">delete</a>(\*\*<a href="src/llama_stack_client/types/agents/session_delete_params.py">params</a>) -> None</code> - <code title="delete /v1/agents/{agent_id}/session/{session_id}">client.agents.session.<a href="./src/llama_stack_client/resources/agents/session.py">delete</a>(session_id, \*, agent_id) -> None</code>
### Steps ### Steps
Types: Types:
```python ```python
from llama_stack_client.types.agents import AgentsStep from llama_stack_client.types.agents import StepRetrieveResponse
``` ```
Methods: Methods:
- <code title="get /agents/step/get">client.agents.steps.<a href="./src/llama_stack_client/resources/agents/steps.py">retrieve</a>(\*\*<a href="src/llama_stack_client/types/agents/step_retrieve_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/agents_step.py">AgentsStep</a></code> - <code title="get /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}">client.agents.steps.<a href="./src/llama_stack_client/resources/agents/steps.py">retrieve</a>(step_id, \*, agent_id, session_id, turn_id) -> <a href="./src/llama_stack_client/types/agents/step_retrieve_response.py">StepRetrieveResponse</a></code>
### Turns ### Turn
Types: Types:
```python ```python
from llama_stack_client.types.agents import AgentsTurnStreamChunk, Turn, TurnStreamEvent from llama_stack_client.types.agents import Turn, TurnCreateResponse
``` ```
Methods: Methods:
- <code title="post /agents/turn/create">client.agents.turns.<a href="./src/llama_stack_client/resources/agents/turns.py">create</a>(\*\*<a href="src/llama_stack_client/types/agents/turn_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/agents_turn_stream_chunk.py">AgentsTurnStreamChunk</a></code> - <code title="post /v1/agents/{agent_id}/session/{session_id}/turn">client.agents.turn.<a href="./src/llama_stack_client/resources/agents/turn.py">create</a>(session_id, \*, agent_id, \*\*<a href="src/llama_stack_client/types/agents/turn_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/turn_create_response.py">TurnCreateResponse</a></code>
- <code title="get /agents/turn/get">client.agents.turns.<a href="./src/llama_stack_client/resources/agents/turns.py">retrieve</a>(\*\*<a href="src/llama_stack_client/types/agents/turn_retrieve_params.py">params</a>) -> <a href="./src/llama_stack_client/types/agents/turn.py">Turn</a></code> - <code title="get /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}">client.agents.turn.<a href="./src/llama_stack_client/resources/agents/turn.py">retrieve</a>(turn_id, \*, agent_id, session_id) -> <a href="./src/llama_stack_client/types/agents/turn.py">Turn</a></code>
## BatchInference
Types:
```python
from llama_stack_client.types import BatchInferenceChatCompletionResponse
```
Methods:
- <code title="post /v1/batch-inference/chat-completion">client.batch_inference.<a href="./src/llama_stack_client/resources/batch_inference.py">chat_completion</a>(\*\*<a href="src/llama_stack_client/types/batch_inference_chat_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/batch_inference_chat_completion_response.py">BatchInferenceChatCompletionResponse</a></code>
- <code title="post /v1/batch-inference/completion">client.batch_inference.<a href="./src/llama_stack_client/resources/batch_inference.py">completion</a>(\*\*<a href="src/llama_stack_client/types/batch_inference_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/shared/batch_completion.py">BatchCompletion</a></code>
## Datasets ## Datasets
Types: Types:
```python ```python
from llama_stack_client.types import TrainEvalDataset from llama_stack_client.types import (
ListDatasetsResponse,
DatasetRetrieveResponse,
DatasetListResponse,
)
``` ```
Methods: Methods:
- <code title="post /datasets/create">client.datasets.<a href="./src/llama_stack_client/resources/datasets.py">create</a>(\*\*<a href="src/llama_stack_client/types/dataset_create_params.py">params</a>) -> None</code> - <code title="get /v1/datasets/{dataset_id}">client.datasets.<a href="./src/llama_stack_client/resources/datasets.py">retrieve</a>(dataset_id) -> <a href="./src/llama_stack_client/types/dataset_retrieve_response.py">Optional[DatasetRetrieveResponse]</a></code>
- <code title="post /datasets/delete">client.datasets.<a href="./src/llama_stack_client/resources/datasets.py">delete</a>(\*\*<a href="src/llama_stack_client/types/dataset_delete_params.py">params</a>) -> None</code> - <code title="get /v1/datasets">client.datasets.<a href="./src/llama_stack_client/resources/datasets.py">list</a>() -> <a href="./src/llama_stack_client/types/dataset_list_response.py">DatasetListResponse</a></code>
- <code title="get /datasets/get">client.datasets.<a href="./src/llama_stack_client/resources/datasets.py">get</a>(\*\*<a href="src/llama_stack_client/types/dataset_get_params.py">params</a>) -> <a href="./src/llama_stack_client/types/train_eval_dataset.py">TrainEvalDataset</a></code> - <code title="post /v1/datasets">client.datasets.<a href="./src/llama_stack_client/resources/datasets.py">register</a>(\*\*<a href="src/llama_stack_client/types/dataset_register_params.py">params</a>) -> None</code>
- <code title="delete /v1/datasets/{dataset_id}">client.datasets.<a href="./src/llama_stack_client/resources/datasets.py">unregister</a>(dataset_id) -> None</code>
## Evaluate ## Eval
Types: Types:
```python ```python
from llama_stack_client.types import EvaluationJob from llama_stack_client.types import EvaluateResponse, Job
``` ```
Methods:
- <code title="post /v1/eval/tasks/{task_id}/evaluations">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">evaluate_rows</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_evaluate_rows_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
- <code title="post /v1/eval/tasks/{task_id}/jobs">client.eval.<a href="./src/llama_stack_client/resources/eval/eval.py">run_eval</a>(task_id, \*\*<a href="src/llama_stack_client/types/eval_run_eval_params.py">params</a>) -> <a href="./src/llama_stack_client/types/job.py">Job</a></code>
### Jobs ### Jobs
Types: Types:
```python ```python
from llama_stack_client.types.evaluate import ( from llama_stack_client.types.eval import JobStatusResponse
EvaluationJobArtifacts,
EvaluationJobLogStream,
EvaluationJobStatus,
)
``` ```
Methods: Methods:
- <code title="get /evaluate/jobs">client.evaluate.jobs.<a href="./src/llama_stack_client/resources/evaluate/jobs/jobs.py">list</a>() -> <a href="./src/llama_stack_client/types/evaluation_job.py">EvaluationJob</a></code> - <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}/result">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">retrieve</a>(job_id, \*, task_id) -> <a href="./src/llama_stack_client/types/evaluate_response.py">EvaluateResponse</a></code>
- <code title="post /evaluate/job/cancel">client.evaluate.jobs.<a href="./src/llama_stack_client/resources/evaluate/jobs/jobs.py">cancel</a>(\*\*<a href="src/llama_stack_client/types/evaluate/job_cancel_params.py">params</a>) -> None</code> - <code title="delete /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">cancel</a>(job_id, \*, task_id) -> None</code>
- <code title="get /v1/eval/tasks/{task_id}/jobs/{job_id}">client.eval.jobs.<a href="./src/llama_stack_client/resources/eval/jobs.py">status</a>(job_id, \*, task_id) -> Optional[JobStatusResponse]</code>
#### Artifacts ## Inspect
Types:
```python
from llama_stack_client.types import HealthInfo, ProviderInfo, RouteInfo, VersionInfo
```
Methods: Methods:
- <code title="get /evaluate/job/artifacts">client.evaluate.jobs.artifacts.<a href="./src/llama_stack_client/resources/evaluate/jobs/artifacts.py">list</a>(\*\*<a href="src/llama_stack_client/types/evaluate/jobs/artifact_list_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate/evaluation_job_artifacts.py">EvaluationJobArtifacts</a></code> - <code title="get /v1/health">client.inspect.<a href="./src/llama_stack_client/resources/inspect.py">health</a>() -> <a href="./src/llama_stack_client/types/health_info.py">HealthInfo</a></code>
- <code title="get /v1/version">client.inspect.<a href="./src/llama_stack_client/resources/inspect.py">version</a>() -> <a href="./src/llama_stack_client/types/version_info.py">VersionInfo</a></code>
#### Logs
Methods:
- <code title="get /evaluate/job/logs">client.evaluate.jobs.logs.<a href="./src/llama_stack_client/resources/evaluate/jobs/logs.py">list</a>(\*\*<a href="src/llama_stack_client/types/evaluate/jobs/log_list_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate/evaluation_job_log_stream.py">EvaluationJobLogStream</a></code>
#### Status
Methods:
- <code title="get /evaluate/job/status">client.evaluate.jobs.status.<a href="./src/llama_stack_client/resources/evaluate/jobs/status.py">list</a>(\*\*<a href="src/llama_stack_client/types/evaluate/jobs/status_list_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluate/evaluation_job_status.py">EvaluationJobStatus</a></code>
### QuestionAnswering
Methods:
- <code title="post /evaluate/question_answering/">client.evaluate.question_answering.<a href="./src/llama_stack_client/resources/evaluate/question_answering.py">create</a>(\*\*<a href="src/llama_stack_client/types/evaluate/question_answering_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluation_job.py">EvaluationJob</a></code>
## Evaluations
Methods:
- <code title="post /evaluate/summarization/">client.evaluations.<a href="./src/llama_stack_client/resources/evaluations.py">summarization</a>(\*\*<a href="src/llama_stack_client/types/evaluation_summarization_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluation_job.py">EvaluationJob</a></code>
- <code title="post /evaluate/text_generation/">client.evaluations.<a href="./src/llama_stack_client/resources/evaluations.py">text_generation</a>(\*\*<a href="src/llama_stack_client/types/evaluation_text_generation_params.py">params</a>) -> <a href="./src/llama_stack_client/types/evaluation_job.py">EvaluationJob</a></code>
## Inference ## Inference
@ -164,8 +213,8 @@ Types:
```python ```python
from llama_stack_client.types import ( from llama_stack_client.types import (
ChatCompletionStreamChunk, CompletionResponse,
CompletionStreamChunk, EmbeddingsResponse,
TokenLogProbs, TokenLogProbs,
InferenceChatCompletionResponse, InferenceChatCompletionResponse,
InferenceCompletionResponse, InferenceCompletionResponse,
@ -174,175 +223,232 @@ from llama_stack_client.types import (
Methods: Methods:
- <code title="post /inference/chat_completion">client.inference.<a href="./src/llama_stack_client/resources/inference/inference.py">chat_completion</a>(\*\*<a href="src/llama_stack_client/types/inference_chat_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/inference_chat_completion_response.py">InferenceChatCompletionResponse</a></code> - <code title="post /v1/inference/chat-completion">client.inference.<a href="./src/llama_stack_client/resources/inference.py">chat_completion</a>(\*\*<a href="src/llama_stack_client/types/inference_chat_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/inference_chat_completion_response.py">InferenceChatCompletionResponse</a></code>
- <code title="post /inference/completion">client.inference.<a href="./src/llama_stack_client/resources/inference/inference.py">completion</a>(\*\*<a href="src/llama_stack_client/types/inference_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/inference_completion_response.py">InferenceCompletionResponse</a></code> - <code title="post /v1/inference/completion">client.inference.<a href="./src/llama_stack_client/resources/inference.py">completion</a>(\*\*<a href="src/llama_stack_client/types/inference_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/inference_completion_response.py">InferenceCompletionResponse</a></code>
- <code title="post /v1/inference/embeddings">client.inference.<a href="./src/llama_stack_client/resources/inference.py">embeddings</a>(\*\*<a href="src/llama_stack_client/types/inference_embeddings_params.py">params</a>) -> <a href="./src/llama_stack_client/types/embeddings_response.py">EmbeddingsResponse</a></code>
### Embeddings ## VectorIo
Types: Types:
```python ```python
from llama_stack_client.types.inference import Embeddings from llama_stack_client.types import QueryChunksResponse
``` ```
Methods: Methods:
- <code title="post /inference/embeddings">client.inference.embeddings.<a href="./src/llama_stack_client/resources/inference/embeddings.py">create</a>(\*\*<a href="src/llama_stack_client/types/inference/embedding_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/inference/embeddings.py">Embeddings</a></code> - <code title="post /v1/vector-io/insert">client.vector_io.<a href="./src/llama_stack_client/resources/vector_io.py">insert</a>(\*\*<a href="src/llama_stack_client/types/vector_io_insert_params.py">params</a>) -> None</code>
- <code title="post /v1/vector-io/query">client.vector_io.<a href="./src/llama_stack_client/resources/vector_io.py">query</a>(\*\*<a href="src/llama_stack_client/types/vector_io_query_params.py">params</a>) -> <a href="./src/llama_stack_client/types/query_chunks_response.py">QueryChunksResponse</a></code>
## Safety ## VectorDBs
Types:
```python
from llama_stack_client.types import RunSheidResponse
```
Methods:
- <code title="post /safety/run_shield">client.safety.<a href="./src/llama_stack_client/resources/safety.py">run_shield</a>(\*\*<a href="src/llama_stack_client/types/safety_run_shield_params.py">params</a>) -> <a href="./src/llama_stack_client/types/run_sheid_response.py">RunSheidResponse</a></code>
## Memory
Types: Types:
```python ```python
from llama_stack_client.types import ( from llama_stack_client.types import (
QueryDocuments, ListVectorDBsResponse,
MemoryCreateResponse, VectorDBRetrieveResponse,
MemoryRetrieveResponse, VectorDBListResponse,
MemoryListResponse, VectorDBRegisterResponse,
MemoryDropResponse,
) )
``` ```
Methods: Methods:
- <code title="post /memory/create">client.memory.<a href="./src/llama_stack_client/resources/memory/memory.py">create</a>(\*\*<a href="src/llama_stack_client/types/memory_create_params.py">params</a>) -> <a href="./src/llama_stack_client/types/memory_create_response.py">object</a></code> - <code title="get /v1/vector-dbs/{vector_db_id}">client.vector_dbs.<a href="./src/llama_stack_client/resources/vector_dbs.py">retrieve</a>(vector_db_id) -> <a href="./src/llama_stack_client/types/vector_db_retrieve_response.py">Optional[VectorDBRetrieveResponse]</a></code>
- <code title="get /memory/get">client.memory.<a href="./src/llama_stack_client/resources/memory/memory.py">retrieve</a>(\*\*<a href="src/llama_stack_client/types/memory_retrieve_params.py">params</a>) -> <a href="./src/llama_stack_client/types/memory_retrieve_response.py">object</a></code> - <code title="get /v1/vector-dbs">client.vector_dbs.<a href="./src/llama_stack_client/resources/vector_dbs.py">list</a>() -> <a href="./src/llama_stack_client/types/vector_db_list_response.py">VectorDBListResponse</a></code>
- <code title="post /memory/update">client.memory.<a href="./src/llama_stack_client/resources/memory/memory.py">update</a>(\*\*<a href="src/llama_stack_client/types/memory_update_params.py">params</a>) -> None</code> - <code title="post /v1/vector-dbs">client.vector_dbs.<a href="./src/llama_stack_client/resources/vector_dbs.py">register</a>(\*\*<a href="src/llama_stack_client/types/vector_db_register_params.py">params</a>) -> <a href="./src/llama_stack_client/types/vector_db_register_response.py">VectorDBRegisterResponse</a></code>
- <code title="get /memory/list">client.memory.<a href="./src/llama_stack_client/resources/memory/memory.py">list</a>() -> <a href="./src/llama_stack_client/types/memory_list_response.py">object</a></code> - <code title="delete /v1/vector-dbs/{vector_db_id}">client.vector_dbs.<a href="./src/llama_stack_client/resources/vector_dbs.py">unregister</a>(vector_db_id) -> None</code>
- <code title="post /memory/drop">client.memory.<a href="./src/llama_stack_client/resources/memory/memory.py">drop</a>(\*\*<a href="src/llama_stack_client/types/memory_drop_params.py">params</a>) -> str</code>
- <code title="post /memory/insert">client.memory.<a href="./src/llama_stack_client/resources/memory/memory.py">insert</a>(\*\*<a href="src/llama_stack_client/types/memory_insert_params.py">params</a>) -> None</code>
- <code title="post /memory/query">client.memory.<a href="./src/llama_stack_client/resources/memory/memory.py">query</a>(\*\*<a href="src/llama_stack_client/types/memory_query_params.py">params</a>) -> <a href="./src/llama_stack_client/types/query_documents.py">QueryDocuments</a></code>
### Documents
Types:
```python
from llama_stack_client.types.memory import DocumentRetrieveResponse
```
Methods:
- <code title="post /memory/documents/get">client.memory.documents.<a href="./src/llama_stack_client/resources/memory/documents.py">retrieve</a>(\*\*<a href="src/llama_stack_client/types/memory/document_retrieve_params.py">params</a>) -> <a href="./src/llama_stack_client/types/memory/document_retrieve_response.py">DocumentRetrieveResponse</a></code>
- <code title="post /memory/documents/delete">client.memory.documents.<a href="./src/llama_stack_client/resources/memory/documents.py">delete</a>(\*\*<a href="src/llama_stack_client/types/memory/document_delete_params.py">params</a>) -> None</code>
## PostTraining
Types:
```python
from llama_stack_client.types import PostTrainingJob
```
Methods:
- <code title="post /post_training/preference_optimize">client.post_training.<a href="./src/llama_stack_client/resources/post_training/post_training.py">preference_optimize</a>(\*\*<a href="src/llama_stack_client/types/post_training_preference_optimize_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training_job.py">PostTrainingJob</a></code>
- <code title="post /post_training/supervised_fine_tune">client.post_training.<a href="./src/llama_stack_client/resources/post_training/post_training.py">supervised_fine_tune</a>(\*\*<a href="src/llama_stack_client/types/post_training_supervised_fine_tune_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training_job.py">PostTrainingJob</a></code>
### Jobs
Types:
```python
from llama_stack_client.types.post_training import (
PostTrainingJobArtifacts,
PostTrainingJobLogStream,
PostTrainingJobStatus,
)
```
Methods:
- <code title="get /post_training/jobs">client.post_training.jobs.<a href="./src/llama_stack_client/resources/post_training/jobs.py">list</a>() -> <a href="./src/llama_stack_client/types/post_training_job.py">PostTrainingJob</a></code>
- <code title="get /post_training/job/artifacts">client.post_training.jobs.<a href="./src/llama_stack_client/resources/post_training/jobs.py">artifacts</a>(\*\*<a href="src/llama_stack_client/types/post_training/job_artifacts_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training/post_training_job_artifacts.py">PostTrainingJobArtifacts</a></code>
- <code title="post /post_training/job/cancel">client.post_training.jobs.<a href="./src/llama_stack_client/resources/post_training/jobs.py">cancel</a>(\*\*<a href="src/llama_stack_client/types/post_training/job_cancel_params.py">params</a>) -> None</code>
- <code title="get /post_training/job/logs">client.post_training.jobs.<a href="./src/llama_stack_client/resources/post_training/jobs.py">logs</a>(\*\*<a href="src/llama_stack_client/types/post_training/job_logs_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training/post_training_job_log_stream.py">PostTrainingJobLogStream</a></code>
- <code title="get /post_training/job/status">client.post_training.jobs.<a href="./src/llama_stack_client/resources/post_training/jobs.py">status</a>(\*\*<a href="src/llama_stack_client/types/post_training/job_status_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training/post_training_job_status.py">PostTrainingJobStatus</a></code>
## RewardScoring
Types:
```python
from llama_stack_client.types import RewardScoring, ScoredDialogGenerations
```
Methods:
- <code title="post /reward_scoring/score">client.reward_scoring.<a href="./src/llama_stack_client/resources/reward_scoring.py">score</a>(\*\*<a href="src/llama_stack_client/types/reward_scoring_score_params.py">params</a>) -> <a href="./src/llama_stack_client/types/reward_scoring.py">RewardScoring</a></code>
## SyntheticDataGeneration
Types:
```python
from llama_stack_client.types import SyntheticDataGeneration
```
Methods:
- <code title="post /synthetic_data_generation/generate">client.synthetic_data_generation.<a href="./src/llama_stack_client/resources/synthetic_data_generation.py">generate</a>(\*\*<a href="src/llama_stack_client/types/synthetic_data_generation_generate_params.py">params</a>) -> <a href="./src/llama_stack_client/types/synthetic_data_generation.py">SyntheticDataGeneration</a></code>
## BatchInference
Types:
```python
from llama_stack_client.types import BatchChatCompletion
```
Methods:
- <code title="post /batch_inference/chat_completion">client.batch_inference.<a href="./src/llama_stack_client/resources/batch_inference.py">chat_completion</a>(\*\*<a href="src/llama_stack_client/types/batch_inference_chat_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/batch_chat_completion.py">BatchChatCompletion</a></code>
- <code title="post /batch_inference/completion">client.batch_inference.<a href="./src/llama_stack_client/resources/batch_inference.py">completion</a>(\*\*<a href="src/llama_stack_client/types/batch_inference_completion_params.py">params</a>) -> <a href="./src/llama_stack_client/types/shared/batch_completion.py">BatchCompletion</a></code>
## Models ## Models
Types: Types:
```python ```python
from llama_stack_client.types import ModelServingSpec from llama_stack_client.types import ListModelsResponse, Model, ModelListResponse
``` ```
Methods: Methods:
- <code title="get /models/list">client.models.<a href="./src/llama_stack_client/resources/models.py">list</a>() -> <a href="./src/llama_stack_client/types/model_serving_spec.py">ModelServingSpec</a></code> - <code title="get /v1/models/{model_id}">client.models.<a href="./src/llama_stack_client/resources/models.py">retrieve</a>(model_id) -> <a href="./src/llama_stack_client/types/model.py">Optional[Model]</a></code>
- <code title="get /models/get">client.models.<a href="./src/llama_stack_client/resources/models.py">get</a>(\*\*<a href="src/llama_stack_client/types/model_get_params.py">params</a>) -> <a href="./src/llama_stack_client/types/model_serving_spec.py">Optional</a></code> - <code title="get /v1/models">client.models.<a href="./src/llama_stack_client/resources/models.py">list</a>() -> <a href="./src/llama_stack_client/types/model_list_response.py">ModelListResponse</a></code>
- <code title="post /v1/models">client.models.<a href="./src/llama_stack_client/resources/models.py">register</a>(\*\*<a href="src/llama_stack_client/types/model_register_params.py">params</a>) -> <a href="./src/llama_stack_client/types/model.py">Model</a></code>
- <code title="delete /v1/models/{model_id}">client.models.<a href="./src/llama_stack_client/resources/models.py">unregister</a>(model_id) -> None</code>
## MemoryBanks ## PostTraining
Types: Types:
```python ```python
from llama_stack_client.types import MemoryBankSpec from llama_stack_client.types import ListPostTrainingJobsResponse, PostTrainingJob
``` ```
Methods: Methods:
- <code title="get /memory_banks/list">client.memory_banks.<a href="./src/llama_stack_client/resources/memory_banks.py">list</a>() -> <a href="./src/llama_stack_client/types/memory_bank_spec.py">MemoryBankSpec</a></code> - <code title="post /v1/post-training/preference-optimize">client.post_training.<a href="./src/llama_stack_client/resources/post_training/post_training.py">preference_optimize</a>(\*\*<a href="src/llama_stack_client/types/post_training_preference_optimize_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training_job.py">PostTrainingJob</a></code>
- <code title="get /memory_banks/get">client.memory_banks.<a href="./src/llama_stack_client/resources/memory_banks.py">get</a>(\*\*<a href="src/llama_stack_client/types/memory_bank_get_params.py">params</a>) -> <a href="./src/llama_stack_client/types/memory_bank_spec.py">Optional</a></code> - <code title="post /v1/post-training/supervised-fine-tune">client.post_training.<a href="./src/llama_stack_client/resources/post_training/post_training.py">supervised_fine_tune</a>(\*\*<a href="src/llama_stack_client/types/post_training_supervised_fine_tune_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training_job.py">PostTrainingJob</a></code>
### Job
Types:
```python
from llama_stack_client.types.post_training import (
JobListResponse,
JobArtifactsResponse,
JobStatusResponse,
)
```
Methods:
- <code title="get /v1/post-training/jobs">client.post_training.job.<a href="./src/llama_stack_client/resources/post_training/job.py">list</a>() -> <a href="./src/llama_stack_client/types/post_training/job_list_response.py">JobListResponse</a></code>
- <code title="get /v1/post-training/job/artifacts">client.post_training.job.<a href="./src/llama_stack_client/resources/post_training/job.py">artifacts</a>(\*\*<a href="src/llama_stack_client/types/post_training/job_artifacts_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training/job_artifacts_response.py">Optional[JobArtifactsResponse]</a></code>
- <code title="post /v1/post-training/job/cancel">client.post_training.job.<a href="./src/llama_stack_client/resources/post_training/job.py">cancel</a>(\*\*<a href="src/llama_stack_client/types/post_training/job_cancel_params.py">params</a>) -> None</code>
- <code title="get /v1/post-training/job/status">client.post_training.job.<a href="./src/llama_stack_client/resources/post_training/job.py">status</a>(\*\*<a href="src/llama_stack_client/types/post_training/job_status_params.py">params</a>) -> <a href="./src/llama_stack_client/types/post_training/job_status_response.py">Optional[JobStatusResponse]</a></code>
## Providers
Types:
```python
from llama_stack_client.types import ListProvidersResponse, ProviderListResponse
```
Methods:
- <code title="get /v1/inspect/providers">client.providers.<a href="./src/llama_stack_client/resources/providers.py">list</a>() -> <a href="./src/llama_stack_client/types/provider_list_response.py">ProviderListResponse</a></code>
## Routes
Types:
```python
from llama_stack_client.types import ListRoutesResponse, RouteListResponse
```
Methods:
- <code title="get /v1/inspect/routes">client.routes.<a href="./src/llama_stack_client/resources/routes.py">list</a>() -> <a href="./src/llama_stack_client/types/route_list_response.py">RouteListResponse</a></code>
## Safety
Types:
```python
from llama_stack_client.types import RunShieldResponse
```
Methods:
- <code title="post /v1/safety/run-shield">client.safety.<a href="./src/llama_stack_client/resources/safety.py">run_shield</a>(\*\*<a href="src/llama_stack_client/types/safety_run_shield_params.py">params</a>) -> <a href="./src/llama_stack_client/types/run_shield_response.py">RunShieldResponse</a></code>
## Shields ## Shields
Types: Types:
```python ```python
from llama_stack_client.types import ShieldSpec from llama_stack_client.types import ListShieldsResponse, Shield, ShieldListResponse
``` ```
Methods: Methods:
- <code title="get /shields/list">client.shields.<a href="./src/llama_stack_client/resources/shields.py">list</a>() -> <a href="./src/llama_stack_client/types/shield_spec.py">ShieldSpec</a></code> - <code title="get /v1/shields/{identifier}">client.shields.<a href="./src/llama_stack_client/resources/shields.py">retrieve</a>(identifier) -> <a href="./src/llama_stack_client/types/shield.py">Optional[Shield]</a></code>
- <code title="get /shields/get">client.shields.<a href="./src/llama_stack_client/resources/shields.py">get</a>(\*\*<a href="src/llama_stack_client/types/shield_get_params.py">params</a>) -> <a href="./src/llama_stack_client/types/shield_spec.py">Optional</a></code> - <code title="get /v1/shields">client.shields.<a href="./src/llama_stack_client/resources/shields.py">list</a>() -> <a href="./src/llama_stack_client/types/shield_list_response.py">ShieldListResponse</a></code>
- <code title="post /v1/shields">client.shields.<a href="./src/llama_stack_client/resources/shields.py">register</a>(\*\*<a href="src/llama_stack_client/types/shield_register_params.py">params</a>) -> <a href="./src/llama_stack_client/types/shield.py">Shield</a></code>
## SyntheticDataGeneration
Types:
```python
from llama_stack_client.types import SyntheticDataGenerationResponse
```
Methods:
- <code title="post /v1/synthetic-data-generation/generate">client.synthetic_data_generation.<a href="./src/llama_stack_client/resources/synthetic_data_generation.py">generate</a>(\*\*<a href="src/llama_stack_client/types/synthetic_data_generation_generate_params.py">params</a>) -> <a href="./src/llama_stack_client/types/synthetic_data_generation_response.py">SyntheticDataGenerationResponse</a></code>
## Telemetry
Types:
```python
from llama_stack_client.types import (
QuerySpansResponse,
SpanWithStatus,
Trace,
TelemetryGetSpanResponse,
TelemetryGetSpanTreeResponse,
TelemetryQuerySpansResponse,
TelemetryQueryTracesResponse,
)
```
Methods:
- <code title="get /v1/telemetry/traces/{trace_id}/spans/{span_id}">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">get_span</a>(span_id, \*, trace_id) -> <a href="./src/llama_stack_client/types/telemetry_get_span_response.py">TelemetryGetSpanResponse</a></code>
- <code title="get /v1/telemetry/spans/{span_id}/tree">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">get_span_tree</a>(span_id, \*\*<a href="src/llama_stack_client/types/telemetry_get_span_tree_params.py">params</a>) -> <a href="./src/llama_stack_client/types/telemetry_get_span_tree_response.py">TelemetryGetSpanTreeResponse</a></code>
- <code title="get /v1/telemetry/traces/{trace_id}">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">get_trace</a>(trace_id) -> <a href="./src/llama_stack_client/types/trace.py">Trace</a></code>
- <code title="post /v1/telemetry/events">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">log_event</a>(\*\*<a href="src/llama_stack_client/types/telemetry_log_event_params.py">params</a>) -> None</code>
- <code title="get /v1/telemetry/spans">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">query_spans</a>(\*\*<a href="src/llama_stack_client/types/telemetry_query_spans_params.py">params</a>) -> <a href="./src/llama_stack_client/types/telemetry_query_spans_response.py">TelemetryQuerySpansResponse</a></code>
- <code title="get /v1/telemetry/traces">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">query_traces</a>(\*\*<a href="src/llama_stack_client/types/telemetry_query_traces_params.py">params</a>) -> <a href="./src/llama_stack_client/types/telemetry_query_traces_response.py">TelemetryQueryTracesResponse</a></code>
- <code title="post /v1/telemetry/spans/export">client.telemetry.<a href="./src/llama_stack_client/resources/telemetry.py">save_spans_to_dataset</a>(\*\*<a href="src/llama_stack_client/types/telemetry_save_spans_to_dataset_params.py">params</a>) -> None</code>
## Datasetio
Types:
```python
from llama_stack_client.types import PaginatedRowsResult
```
Methods:
- <code title="post /v1/datasetio/rows">client.datasetio.<a href="./src/llama_stack_client/resources/datasetio.py">append_rows</a>(\*\*<a href="src/llama_stack_client/types/datasetio_append_rows_params.py">params</a>) -> None</code>
- <code title="get /v1/datasetio/rows">client.datasetio.<a href="./src/llama_stack_client/resources/datasetio.py">get_rows_paginated</a>(\*\*<a href="src/llama_stack_client/types/datasetio_get_rows_paginated_params.py">params</a>) -> <a href="./src/llama_stack_client/types/paginated_rows_result.py">PaginatedRowsResult</a></code>
## Scoring
Types:
```python
from llama_stack_client.types import ScoringScoreResponse, ScoringScoreBatchResponse
```
Methods:
- <code title="post /v1/scoring/score">client.scoring.<a href="./src/llama_stack_client/resources/scoring.py">score</a>(\*\*<a href="src/llama_stack_client/types/scoring_score_params.py">params</a>) -> <a href="./src/llama_stack_client/types/scoring_score_response.py">ScoringScoreResponse</a></code>
- <code title="post /v1/scoring/score-batch">client.scoring.<a href="./src/llama_stack_client/resources/scoring.py">score_batch</a>(\*\*<a href="src/llama_stack_client/types/scoring_score_batch_params.py">params</a>) -> <a href="./src/llama_stack_client/types/scoring_score_batch_response.py">ScoringScoreBatchResponse</a></code>
## ScoringFunctions
Types:
```python
from llama_stack_client.types import (
ListScoringFunctionsResponse,
ScoringFn,
ScoringFunctionListResponse,
)
```
Methods:
- <code title="get /v1/scoring-functions/{scoring_fn_id}">client.scoring_functions.<a href="./src/llama_stack_client/resources/scoring_functions.py">retrieve</a>(scoring_fn_id) -> <a href="./src/llama_stack_client/types/scoring_fn.py">Optional[ScoringFn]</a></code>
- <code title="get /v1/scoring-functions">client.scoring_functions.<a href="./src/llama_stack_client/resources/scoring_functions.py">list</a>() -> <a href="./src/llama_stack_client/types/scoring_function_list_response.py">ScoringFunctionListResponse</a></code>
- <code title="post /v1/scoring-functions">client.scoring_functions.<a href="./src/llama_stack_client/resources/scoring_functions.py">register</a>(\*\*<a href="src/llama_stack_client/types/scoring_function_register_params.py">params</a>) -> None</code>
## EvalTasks
Types:
```python
from llama_stack_client.types import EvalTask, ListEvalTasksResponse, EvalTaskListResponse
```
Methods:
- <code title="get /v1/eval-tasks/{eval_task_id}">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">retrieve</a>(eval_task_id) -> <a href="./src/llama_stack_client/types/eval_task.py">Optional[EvalTask]</a></code>
- <code title="get /v1/eval-tasks">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">list</a>() -> <a href="./src/llama_stack_client/types/eval_task_list_response.py">EvalTaskListResponse</a></code>
- <code title="post /v1/eval-tasks">client.eval_tasks.<a href="./src/llama_stack_client/resources/eval_tasks.py">register</a>(\*\*<a href="src/llama_stack_client/types/eval_task_register_params.py">params</a>) -> None</code>

View file

@ -1,41 +0,0 @@
# Llama Stack Developer Cookbook
Based on your developer needs, below are references to guides to help you get started.
### Hosted Llama Stack Endpoint
* Developer Need: I want to connect to a Llama Stack endpoint to build my applications.
* Effort: 1min
* Guide:
- Check out our [DeepLearning course](https://www.deeplearning.ai/short-courses/introducing-multimodal-llama-3-2) on building Llama Stack apps with a pre-hosted Llama Stack endpoint.
### Local meta-reference Llama Stack Server
* Developer Need: I want to start a local Llama Stack server with my GPU using meta-reference implementations.
* Effort: 5min
* Guide:
- Please see our [meta-reference-gpu](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) guide on starting up a meta-reference Llama Stack server.
### Llama Stack Server with Remote Providers
* Developer Need: I want a Llama Stack distribution with a remote provider.
* Effort: 10min
* Guide:
- Please see our [Distributions Guide](https://llama-stack.readthedocs.io/en/latest/concepts/index.html#distributions) on starting up distributions with remote providers.
### On-Device (iOS) Llama Stack
* Developer Need: I want to use Llama Stack on-device.
* Effort: 1.5hr
* Guide:
- Please see our [iOS Llama Stack SDK](./ios_sdk.md) implementation.
### Assemble your own Llama Stack Distribution
* Developer Need: I want to assemble my own distribution with API providers to my liking.
* Effort: 30min
* Guide:
- Please see our [Building Distribution](./building_distro.md) guide for assembling your own Llama Stack distribution with your choice of API providers.
### Adding a New API Provider
* Developer Need: I want to add a new API provider to Llama Stack.
* Effort: 3hr
* Guide:
- Please see our [Adding a New API Provider](https://llama-stack.readthedocs.io/en/latest/contributing/new_api_provider.html) guide for adding a new API provider.

View file

@ -33,7 +33,6 @@ from llama_stack.apis.inference import (
ToolResponseMessage, ToolResponseMessage,
UserMessage, UserMessage,
) )
from llama_stack.apis.memory import MemoryBank
from llama_stack.apis.safety import SafetyViolation from llama_stack.apis.safety import SafetyViolation
from llama_stack.apis.tools import ToolDef from llama_stack.apis.tools import ToolDef
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@ -89,7 +88,7 @@ class MemoryRetrievalStep(StepCommon):
step_type: Literal[StepType.memory_retrieval.value] = ( step_type: Literal[StepType.memory_retrieval.value] = (
StepType.memory_retrieval.value StepType.memory_retrieval.value
) )
memory_bank_ids: List[str] vector_db_ids: str
inserted_context: InterleavedContent inserted_context: InterleavedContent
@ -133,8 +132,6 @@ class Session(BaseModel):
turns: List[Turn] turns: List[Turn]
started_at: datetime started_at: datetime
memory_bank: Optional[MemoryBank] = None
class AgentToolGroupWithArgs(BaseModel): class AgentToolGroupWithArgs(BaseModel):
name: str name: str
@ -158,9 +155,7 @@ class AgentConfigCommon(BaseModel):
toolgroups: Optional[List[AgentToolGroup]] = Field(default_factory=list) toolgroups: Optional[List[AgentToolGroup]] = Field(default_factory=list)
client_tools: Optional[List[ToolDef]] = Field(default_factory=list) client_tools: Optional[List[ToolDef]] = Field(default_factory=list)
tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto) tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
tool_prompt_format: Optional[ToolPromptFormat] = Field( tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
default=ToolPromptFormat.json
)
max_infer_iters: int = 10 max_infer_iters: int = 10
@ -234,11 +229,8 @@ class AgentTurnResponseTurnCompletePayload(BaseModel):
turn: Turn turn: Turn
@json_schema_type AgentTurnResponseEventPayload = register_schema(
class AgentTurnResponseEvent(BaseModel): Annotated[
"""Streamed agent execution response."""
payload: Annotated[
Union[ Union[
AgentTurnResponseStepStartPayload, AgentTurnResponseStepStartPayload,
AgentTurnResponseStepProgressPayload, AgentTurnResponseStepProgressPayload,
@ -247,7 +239,14 @@ class AgentTurnResponseEvent(BaseModel):
AgentTurnResponseTurnCompletePayload, AgentTurnResponseTurnCompletePayload,
], ],
Field(discriminator="event_type"), Field(discriminator="event_type"),
] ],
name="AgentTurnResponseEventPayload",
)
@json_schema_type
class AgentTurnResponseEvent(BaseModel):
payload: AgentTurnResponseEventPayload
@json_schema_type @json_schema_type

View file

@ -137,7 +137,7 @@ class EventLogger:
event, event,
LogEvent( LogEvent(
role=None, role=None,
content=delta.content, content=delta.tool_call,
end="", end="",
color="cyan", color="cyan",
), ),
@ -208,7 +208,7 @@ class EventLogger:
): ):
details = event.payload.step_details details = event.payload.step_details
inserted_context = interleaved_content_as_str(details.inserted_context) inserted_context = interleaved_content_as_str(details.inserted_context)
content = f"fetched {len(inserted_context)} bytes from {details.memory_bank_ids}" content = f"fetched {len(inserted_context)} bytes from {details.vector_db_ids}"
yield ( yield (
event, event,

View file

@ -38,8 +38,9 @@ class _URLOrData(BaseModel):
@json_schema_type @json_schema_type
class ImageContentItem(_URLOrData): class ImageContentItem(BaseModel):
type: Literal["image"] = "image" type: Literal["image"] = "image"
image: _URLOrData
@json_schema_type @json_schema_type
@ -73,7 +74,7 @@ class TextDelta(BaseModel):
@json_schema_type @json_schema_type
class ImageDelta(BaseModel): class ImageDelta(BaseModel):
type: Literal["image"] = "image" type: Literal["image"] = "image"
data: bytes image: bytes
@json_schema_type @json_schema_type
@ -91,7 +92,7 @@ class ToolCallDelta(BaseModel):
# you either send an in-progress tool call so the client can stream a long # you either send an in-progress tool call so the client can stream a long
# code generation or you send the final parsed tool call at the end of the # code generation or you send the final parsed tool call at the end of the
# stream # stream
content: Union[str, ToolCall] tool_call: Union[str, ToolCall]
parse_status: ToolCallParseStatus parse_status: ToolCallParseStatus

View file

@ -0,0 +1,35 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import Enum
from llama_models.schema_utils import json_schema_type
# Enumeration of the API names known to the stack. Every member's string
# value is identical to its member name.
@json_schema_type
class Api(Enum):
inference = "inference"
safety = "safety"
agents = "agents"
vector_io = "vector_io"
datasetio = "datasetio"
scoring = "scoring"
eval = "eval"
post_training = "post_training"
tool_runtime = "tool_runtime"
telemetry = "telemetry"
models = "models"
shields = "shields"
vector_dbs = "vector_dbs"
datasets = "datasets"
scoring_functions = "scoring_functions"
eval_tasks = "eval_tasks"
tool_groups = "tool_groups"
# built-in API
inspect = "inspect"

View file

@ -6,7 +6,7 @@
from typing import Any, Dict, List, Literal, Optional, Protocol, Union from typing import Any, Dict, List, Literal, Optional, Protocol, Union
from llama_models.schema_utils import json_schema_type, webmethod from llama_models.schema_utils import json_schema_type, register_schema, webmethod
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing_extensions import Annotated from typing_extensions import Annotated
@ -31,9 +31,10 @@ class AgentCandidate(BaseModel):
config: AgentConfig config: AgentConfig
EvalCandidate = Annotated[ EvalCandidate = register_schema(
Union[ModelCandidate, AgentCandidate], Field(discriminator="type") Annotated[Union[ModelCandidate, AgentCandidate], Field(discriminator="type")],
] name="EvalCandidate",
)
@json_schema_type @json_schema_type
@ -61,9 +62,12 @@ class AppEvalTaskConfig(BaseModel):
# we could optionally add any specific dataset config here # we could optionally add any specific dataset config here
EvalTaskConfig = Annotated[ EvalTaskConfig = register_schema(
Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type") Annotated[
] Union[BenchmarkEvalTaskConfig, AppEvalTaskConfig], Field(discriminator="type")
],
name="EvalTaskConfig",
)
@json_schema_type @json_schema_type

View file

@ -157,11 +157,13 @@ class ChatCompletionResponseEvent(BaseModel):
stop_reason: Optional[StopReason] = None stop_reason: Optional[StopReason] = None
@json_schema_type
class ResponseFormatType(Enum): class ResponseFormatType(Enum):
json_schema = "json_schema" json_schema = "json_schema"
grammar = "grammar" grammar = "grammar"
@json_schema_type
class JsonSchemaResponseFormat(BaseModel): class JsonSchemaResponseFormat(BaseModel):
type: Literal[ResponseFormatType.json_schema.value] = ( type: Literal[ResponseFormatType.json_schema.value] = (
ResponseFormatType.json_schema.value ResponseFormatType.json_schema.value
@ -169,6 +171,7 @@ class JsonSchemaResponseFormat(BaseModel):
json_schema: Dict[str, Any] json_schema: Dict[str, Any]
@json_schema_type
class GrammarResponseFormat(BaseModel): class GrammarResponseFormat(BaseModel):
type: Literal[ResponseFormatType.grammar.value] = ResponseFormatType.grammar.value type: Literal[ResponseFormatType.grammar.value] = ResponseFormatType.grammar.value
bnf: Dict[str, Any] bnf: Dict[str, Any]

View file

@ -1,161 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import Enum
from typing import (
Annotated,
List,
Literal,
Optional,
Protocol,
runtime_checkable,
Union,
)
from llama_models.schema_utils import json_schema_type, register_schema, webmethod
from pydantic import BaseModel, Field
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
# The four kinds of memory bank. The string values double as the
# `memory_bank_type` discriminator literals used by the params and bank
# models defined later in this module.
@json_schema_type
class MemoryBankType(Enum):
vector = "vector"
keyvalue = "keyvalue"
keyword = "keyword"
graph = "graph"
# define params for each type of memory bank, this leads to a tagged union
# accepted as input from the API or from the config.
# Parameters for a vector memory bank: `embedding_model` and
# `chunk_size_in_tokens` are required, `overlap_size_in_tokens` is optional.
@json_schema_type
class VectorMemoryBankParams(BaseModel):
# Discriminator tag; fixed to "vector" so `BankParams` can select this model.
memory_bank_type: Literal[MemoryBankType.vector.value] = MemoryBankType.vector.value
embedding_model: str
chunk_size_in_tokens: int
overlap_size_in_tokens: Optional[int] = None
# Parameters for a key-value memory bank; carries only the discriminator tag.
@json_schema_type
class KeyValueMemoryBankParams(BaseModel):
# Discriminator tag; fixed to "keyvalue".
memory_bank_type: Literal[MemoryBankType.keyvalue.value] = (
MemoryBankType.keyvalue.value
)
# Parameters for a keyword memory bank; carries only the discriminator tag.
@json_schema_type
class KeywordMemoryBankParams(BaseModel):
# Discriminator tag; fixed to "keyword".
memory_bank_type: Literal[MemoryBankType.keyword.value] = (
MemoryBankType.keyword.value
)
# Parameters for a graph memory bank; carries only the discriminator tag.
@json_schema_type
class GraphMemoryBankParams(BaseModel):
# Discriminator tag; fixed to "graph".
memory_bank_type: Literal[MemoryBankType.graph.value] = MemoryBankType.graph.value
# Tagged union over the four *Params models; the concrete model is chosen by
# the value of the `memory_bank_type` field.
BankParams = Annotated[
Union[
VectorMemoryBankParams,
KeyValueMemoryBankParams,
KeywordMemoryBankParams,
GraphMemoryBankParams,
],
Field(discriminator="memory_bank_type"),
]
# Some common functionality for memory banks.
# Pins the resource `type` to "memory_bank" and exposes two read-only aliases
# under memory-bank-specific names.
class MemoryBankResourceMixin(Resource):
type: Literal[ResourceType.memory_bank.value] = ResourceType.memory_bank.value
# Alias for `self.identifier` (presumably declared on `Resource` -- confirm).
@property
def memory_bank_id(self) -> str:
return self.identifier
# Alias for `self.provider_resource_id` (presumably declared on `Resource` -- confirm).
@property
def provider_memory_bank_id(self) -> str:
return self.provider_resource_id
# Registered vector memory bank resource. Has the same fields as
# `VectorMemoryBankParams` plus `embedding_dimension`.
@json_schema_type
class VectorMemoryBank(MemoryBankResourceMixin):
# Discriminator tag; fixed to "vector" so the `MemoryBank` union can select this model.
memory_bank_type: Literal[MemoryBankType.vector.value] = MemoryBankType.vector.value
embedding_model: str
chunk_size_in_tokens: int
embedding_dimension: Optional[int] = 384 # default to minilm-l6-v2
overlap_size_in_tokens: Optional[int] = None
# Registered key-value memory bank resource; adds only the discriminator tag
# to the mixin's fields.
@json_schema_type
class KeyValueMemoryBank(MemoryBankResourceMixin):
# Discriminator tag; fixed to "keyvalue".
memory_bank_type: Literal[MemoryBankType.keyvalue.value] = (
MemoryBankType.keyvalue.value
)
# TODO: KeyValue and Keyword are so similar in name, oof. Get a better naming convention.
# Registered keyword memory bank resource; adds only the discriminator tag
# to the mixin's fields.
@json_schema_type
class KeywordMemoryBank(MemoryBankResourceMixin):
# Discriminator tag; fixed to "keyword".
memory_bank_type: Literal[MemoryBankType.keyword.value] = (
MemoryBankType.keyword.value
)
# Registered graph memory bank resource; adds only the discriminator tag
# to the mixin's fields.
@json_schema_type
class GraphMemoryBank(MemoryBankResourceMixin):
# Discriminator tag; fixed to "graph".
memory_bank_type: Literal[MemoryBankType.graph.value] = MemoryBankType.graph.value
# Tagged union over the four concrete bank models, discriminated by
# `memory_bank_type` and passed to `register_schema` with the name "MemoryBank".
MemoryBank = register_schema(
Annotated[
Union[
VectorMemoryBank,
KeyValueMemoryBank,
KeywordMemoryBank,
GraphMemoryBank,
],
Field(discriminator="memory_bank_type"),
],
name="MemoryBank",
)
# Input description of a memory bank: an id, its type-specific `params`
# (one of the `BankParams` variants) and an optional provider-side id.
# NOTE(review): unlike the models around it, this class has no
# `@json_schema_type` decorator -- presumably intentional; confirm.
class MemoryBankInput(BaseModel):
memory_bank_id: str
params: BankParams
provider_memory_bank_id: Optional[str] = None
# Return type of `MemoryBanks.list_memory_banks`: the banks wrapped in a `data` list.
class ListMemoryBanksResponse(BaseModel):
data: List[MemoryBank]
# Protocol for the memory-bank registry API. Each method is bound to an HTTP
# route through `@webmethod`; the bodies are `...` stubs to be supplied by
# implementations.
@runtime_checkable
@trace_protocol
class MemoryBanks(Protocol):
# GET /memory-banks -- list memory banks.
@webmethod(route="/memory-banks", method="GET")
async def list_memory_banks(self) -> ListMemoryBanksResponse: ...
# GET /memory-banks/{memory_bank_id} -- look up one bank; the return type
# is Optional, so the result may be None.
@webmethod(route="/memory-banks/{memory_bank_id}", method="GET")
async def get_memory_bank(
self,
memory_bank_id: str,
) -> Optional[MemoryBank]: ...
# POST /memory-banks -- register a bank from its id and type-specific
# params; the provider id and provider-side bank id are optional.
@webmethod(route="/memory-banks", method="POST")
async def register_memory_bank(
self,
memory_bank_id: str,
params: BankParams,
provider_id: Optional[str] = None,
provider_memory_bank_id: Optional[str] = None,
) -> MemoryBank: ...
# DELETE /memory-banks/{memory_bank_id} -- unregister a bank; returns nothing.
@webmethod(route="/memory-banks/{memory_bank_id}", method="DELETE")
async def unregister_memory_bank(self, memory_bank_id: str) -> None: ...

View file

@ -8,7 +8,7 @@ from datetime import datetime
from enum import Enum from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Protocol, Union from typing import Any, Dict, List, Literal, Optional, Protocol, Union
from llama_models.schema_utils import json_schema_type, webmethod from llama_models.schema_utils import json_schema_type, register_schema, webmethod
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing_extensions import Annotated from typing_extensions import Annotated
@ -88,9 +88,12 @@ class QATFinetuningConfig(BaseModel):
group_size: int group_size: int
AlgorithmConfig = Annotated[ AlgorithmConfig = register_schema(
Union[LoraFinetuningConfig, QATFinetuningConfig], Field(discriminator="type") Annotated[
] Union[LoraFinetuningConfig, QATFinetuningConfig], Field(discriminator="type")
],
name="AlgorithmConfig",
)
@json_schema_type @json_schema_type

View file

@ -14,7 +14,7 @@ from pydantic import BaseModel, Field
class ResourceType(Enum): class ResourceType(Enum):
model = "model" model = "model"
shield = "shield" shield = "shield"
memory_bank = "memory_bank" vector_db = "vector_db"
dataset = "dataset" dataset = "dataset"
scoring_function = "scoring_function" scoring_function = "scoring_function"
eval_task = "eval_task" eval_task = "eval_task"
@ -37,5 +37,5 @@ class Resource(BaseModel):
provider_id: str = Field(description="ID of the provider that owns this resource") provider_id: str = Field(description="ID of the provider that owns this resource")
type: ResourceType = Field( type: ResourceType = Field(
description="Type of resource (e.g. 'model', 'shield', 'memory_bank', etc.)" description="Type of resource (e.g. 'model', 'shield', 'vector_db', etc.)"
) )

View file

@ -16,7 +16,7 @@ from typing import (
Union, Union,
) )
from llama_models.schema_utils import json_schema_type, webmethod from llama_models.schema_utils import json_schema_type, register_schema, webmethod
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing_extensions import Annotated from typing_extensions import Annotated
@ -82,14 +82,17 @@ class BasicScoringFnParams(BaseModel):
) )
ScoringFnParams = Annotated[ ScoringFnParams = register_schema(
Union[ Annotated[
LLMAsJudgeScoringFnParams, Union[
RegexParserScoringFnParams, LLMAsJudgeScoringFnParams,
BasicScoringFnParams, RegexParserScoringFnParams,
BasicScoringFnParams,
],
Field(discriminator="type"),
], ],
Field(discriminator="type"), name="ScoringFnParams",
] )
class CommonScoringFnFields(BaseModel): class CommonScoringFnFields(BaseModel):

View file

@ -17,7 +17,7 @@ from typing import (
Union, Union,
) )
from llama_models.schema_utils import json_schema_type, webmethod from llama_models.schema_utils import json_schema_type, register_schema, webmethod
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing_extensions import Annotated from typing_extensions import Annotated
@ -115,13 +115,16 @@ class SpanEndPayload(BaseModel):
status: SpanStatus status: SpanStatus
StructuredLogPayload = Annotated[ StructuredLogPayload = register_schema(
Union[ Annotated[
SpanStartPayload, Union[
SpanEndPayload, SpanStartPayload,
SpanEndPayload,
],
Field(discriminator="type"),
], ],
Field(discriminator="type"), name="StructuredLogPayload",
] )
@json_schema_type @json_schema_type
@ -130,14 +133,17 @@ class StructuredLogEvent(EventCommon):
payload: StructuredLogPayload payload: StructuredLogPayload
Event = Annotated[ Event = register_schema(
Union[ Annotated[
UnstructuredLogEvent, Union[
MetricEvent, UnstructuredLogEvent,
StructuredLogEvent, MetricEvent,
StructuredLogEvent,
],
Field(discriminator="type"),
], ],
Field(discriminator="type"), name="Event",
] )
@json_schema_type @json_schema_type

View file

@ -5,3 +5,4 @@
# the root directory of this source tree. # the root directory of this source tree.
from .tools import * # noqa: F401 F403 from .tools import * # noqa: F401 F403
from .rag_tool import * # noqa: F401 F403

View file

@ -0,0 +1,95 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Union
from llama_models.schema_utils import json_schema_type, register_schema, webmethod
from pydantic import BaseModel, Field
from typing_extensions import Annotated, Protocol, runtime_checkable
from llama_stack.apis.common.content_types import InterleavedContent, URL
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
# A document handed to `RAGToolRuntime.insert`. `content` is either
# interleaved content or a URL; `mime_type` is optional and `metadata`
# defaults to a fresh empty dict per instance.
@json_schema_type
class RAGDocument(BaseModel):
document_id: str
content: InterleavedContent | URL
mime_type: str | None = None
metadata: Dict[str, Any] = Field(default_factory=dict)
# Return type of `RAGToolRuntime.query`; `content` is optional and defaults to None.
@json_schema_type
class RAGQueryResult(BaseModel):
content: Optional[InterleavedContent] = None
# Names of the query-generator kinds. "default" and "llm" match the `type`
# literals of the two generator config models in this module.
# NOTE(review): no config model with type "custom" is defined in this module.
@json_schema_type
class RAGQueryGenerator(Enum):
default = "default"
llm = "llm"
custom = "custom"
# Config for the "default" query generator.
@json_schema_type
class DefaultRAGQueryGeneratorConfig(BaseModel):
# Discriminator tag for the `RAGQueryGeneratorConfig` union.
type: Literal["default"] = "default"
# Defaults to a single space; presumably used to join message contents
# into one query string -- confirm against the generator implementation.
separator: str = " "
# Config for the "llm" query generator; both `model` and `template` are required.
@json_schema_type
class LLMRAGQueryGeneratorConfig(BaseModel):
# Discriminator tag for the `RAGQueryGeneratorConfig` union.
type: Literal["llm"] = "llm"
model: str
# NOTE(review): the template syntax is not visible here -- check the consumer.
template: str
# Tagged union over the two generator configs, discriminated by `type` and
# passed to `register_schema` with the name "RAGQueryGeneratorConfig".
RAGQueryGeneratorConfig = register_schema(
Annotated[
Union[
DefaultRAGQueryGeneratorConfig,
LLMRAGQueryGeneratorConfig,
],
Field(discriminator="type"),
],
name="RAGQueryGeneratorConfig",
)
# Optional configuration accepted by `RAGToolRuntime.query`.
@json_schema_type
class RAGQueryConfig(BaseModel):
# This config defines how a query is generated using the messages
# for vector DB retrieval. Defaults to the "default" generator config.
query_generator_config: RAGQueryGeneratorConfig = Field(
default=DefaultRAGQueryGeneratorConfig()
)
# Numeric limits defaulting to 4096 and 5; going by the names they cap
# the tokens and chunks placed in the returned context -- confirm.
max_tokens_in_context: int = 4096
max_chunks: int = 5
@runtime_checkable
@trace_protocol
class RAGToolRuntime(Protocol):
    """Protocol for the RAG tool, exposed under ``/tool-runtime/rag-tool``."""

    @webmethod(route="/tool-runtime/rag-tool/insert", method="POST")
    async def insert(
        self,
        documents: List[RAGDocument],
        vector_db_id: str,
        chunk_size_in_tokens: int = 512,
    ) -> None:
        """Index documents so they can be used by the RAG system

        :param documents: The documents to index.
        :param vector_db_id: Identifier of the vector DB to index into.
        :param chunk_size_in_tokens: Chunk size, in tokens (defaults to 512).
        """
        ...

    @webmethod(route="/tool-runtime/rag-tool/query", method="POST")
    async def query(
        self,
        content: InterleavedContent,
        vector_db_ids: List[str],
        query_config: Optional[RAGQueryConfig] = None,
    ) -> RAGQueryResult:
        """Query the RAG system for context; typically invoked by the agent

        :param content: The content to query with.
        :param vector_db_ids: Identifiers of the vector DBs to search.
        :param query_config: Optional query settings; defaults to None.
        :returns: A RAGQueryResult.
        """
        ...

View file

@ -15,6 +15,8 @@ from llama_stack.apis.common.content_types import InterleavedContent, URL
from llama_stack.apis.resource import Resource, ResourceType from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
from .rag_tool import RAGToolRuntime
@json_schema_type @json_schema_type
class ToolParameter(BaseModel): class ToolParameter(BaseModel):
@ -130,11 +132,17 @@ class ToolGroups(Protocol):
... ...
class SpecialToolGroup(Enum):
rag_tool = "rag_tool"
@runtime_checkable @runtime_checkable
@trace_protocol @trace_protocol
class ToolRuntime(Protocol): class ToolRuntime(Protocol):
tool_store: ToolStore tool_store: ToolStore
rag_tool: RAGToolRuntime
# TODO: This needs to be renamed once OPEN API generator name conflict issue is fixed. # TODO: This needs to be renamed once OPEN API generator name conflict issue is fixed.
@webmethod(route="/tool-runtime/list-tools", method="GET") @webmethod(route="/tool-runtime/list-tools", method="GET")
async def list_runtime_tools( async def list_runtime_tools(
@ -143,7 +151,7 @@ class ToolRuntime(Protocol):
@webmethod(route="/tool-runtime/invoke", method="POST") @webmethod(route="/tool-runtime/invoke", method="POST")
async def invoke_tool( async def invoke_tool(
self, tool_name: str, args: Dict[str, Any] self, tool_name: str, kwargs: Dict[str, Any]
) -> ToolInvocationResult: ) -> ToolInvocationResult:
"""Run a tool with the given arguments""" """Run a tool with the given arguments"""
... ...

View file

@ -4,4 +4,4 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from .memory_banks import * # noqa: F401 F403 from .vector_dbs import * # noqa: F401 F403

View file

@ -0,0 +1,66 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from typing import List, Literal, Optional, Protocol, runtime_checkable
from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel
from llama_stack.apis.resource import Resource, ResourceType
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@json_schema_type
class VectorDB(Resource):
    """Resource describing a vector DB and the embedding model/dimension it uses."""

    # Resource-type tag, pinned to the vector_db value.
    type: Literal[ResourceType.vector_db.value] = ResourceType.vector_db.value

    embedding_model: str
    embedding_dimension: int

    @property
    def vector_db_id(self) -> str:
        """Alias for this resource's ``identifier``."""
        return self.identifier

    @property
    def provider_vector_db_id(self) -> str:
        """Alias for this resource's ``provider_resource_id``."""
        return self.provider_resource_id
class VectorDBInput(BaseModel):
    """Input model describing a vector DB by id, embedding model and dimension.

    NOTE(review): presumably used to declare vector DBs in a stack run
    config -- confirm against the consumers of this model.
    """

    vector_db_id: str
    embedding_model: str
    embedding_dimension: int
    # Optional provider-side identifier; None when not supplied.
    provider_vector_db_id: Optional[str] = None
class ListVectorDBsResponse(BaseModel):
    """Response envelope returned by ``VectorDBs.list_vector_dbs``."""

    # The vector DBs being listed.
    data: List[VectorDB]
@runtime_checkable
@trace_protocol
class VectorDBs(Protocol):
    """Protocol for listing, fetching, registering and unregistering vector DBs."""

    @webmethod(route="/vector-dbs", method="GET")
    async def list_vector_dbs(self) -> ListVectorDBsResponse:
        """List vector DBs, wrapped in a ListVectorDBsResponse."""
        ...

    @webmethod(route="/vector-dbs/{vector_db_id}", method="GET")
    async def get_vector_db(
        self,
        vector_db_id: str,
    ) -> Optional[VectorDB]:
        """Fetch one vector DB by id; the return type allows None."""
        ...

    @webmethod(route="/vector-dbs", method="POST")
    async def register_vector_db(
        self,
        vector_db_id: str,
        embedding_model: str,
        embedding_dimension: Optional[int] = 384,
        provider_id: Optional[str] = None,
        provider_vector_db_id: Optional[str] = None,
    ) -> VectorDB:
        """Register a vector DB and return the resulting VectorDB.

        Only ``vector_db_id`` and ``embedding_model`` are required;
        ``embedding_dimension`` defaults to 384 and both provider fields
        default to None.
        """
        ...

    @webmethod(route="/vector-dbs/{vector_db_id}", method="DELETE")
    async def unregister_vector_db(self, vector_db_id: str) -> None:
        """Unregister the vector DB with the given id."""
        ...

View file

@ -4,4 +4,4 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
from .memory import * # noqa: F401 F403 from .vector_io import * # noqa: F401 F403

View file

@ -13,55 +13,45 @@ from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
from llama_models.schema_utils import json_schema_type, webmethod from llama_models.schema_utils import json_schema_type, webmethod
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from llama_stack.apis.common.content_types import URL
from llama_stack.apis.inference import InterleavedContent from llama_stack.apis.inference import InterleavedContent
from llama_stack.apis.memory_banks import MemoryBank from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol from llama_stack.providers.utils.telemetry.trace_protocol import trace_protocol
@json_schema_type
class MemoryBankDocument(BaseModel):
document_id: str
content: InterleavedContent | URL
mime_type: str | None = None
metadata: Dict[str, Any] = Field(default_factory=dict)
class Chunk(BaseModel): class Chunk(BaseModel):
content: InterleavedContent content: InterleavedContent
token_count: int metadata: Dict[str, Any] = Field(default_factory=dict)
document_id: str
@json_schema_type @json_schema_type
class QueryDocumentsResponse(BaseModel): class QueryChunksResponse(BaseModel):
chunks: List[Chunk] chunks: List[Chunk]
scores: List[float] scores: List[float]
class MemoryBankStore(Protocol): class VectorDBStore(Protocol):
def get_memory_bank(self, bank_id: str) -> Optional[MemoryBank]: ... def get_vector_db(self, vector_db_id: str) -> Optional[VectorDB]: ...
@runtime_checkable @runtime_checkable
@trace_protocol @trace_protocol
class Memory(Protocol): class VectorIO(Protocol):
memory_bank_store: MemoryBankStore vector_db_store: VectorDBStore
# this will just block now until documents are inserted, but it should # this will just block now until chunks are inserted, but it should
# probably return a Job instance which can be polled for completion # probably return a Job instance which can be polled for completion
@webmethod(route="/memory/insert", method="POST") @webmethod(route="/vector-io/insert", method="POST")
async def insert_documents( async def insert_chunks(
self, self,
bank_id: str, vector_db_id: str,
documents: List[MemoryBankDocument], chunks: List[Chunk],
ttl_seconds: Optional[int] = None, ttl_seconds: Optional[int] = None,
) -> None: ... ) -> None: ...
@webmethod(route="/memory/query", method="POST") @webmethod(route="/vector-io/query", method="POST")
async def query_documents( async def query_chunks(
self, self,
bank_id: str, vector_db_id: str,
query: InterleavedContent, query: InterleavedContent,
params: Optional[Dict[str, Any]] = None, params: Optional[Dict[str, Any]] = None,
) -> QueryDocumentsResponse: ... ) -> QueryChunksResponse: ...

View file

@ -115,6 +115,8 @@ def run_stack_build_command(
f"Using conda environment {image_name}", f"Using conda environment {image_name}",
color="green", color="green",
) )
else:
image_name = f"llamastack-{name}"
cprint( cprint(
textwrap.dedent( textwrap.dedent(
@ -171,19 +173,30 @@ def run_stack_build_command(
) )
return return
_run_stack_build_command_from_build_config(build_config, image_name=image_name) if build_config.image_type == ImageType.container.value and not args.image_name:
cprint(
"Please specify --image-name when building a container from a config file",
color="red",
)
return
_run_stack_build_command_from_build_config(
build_config, image_name=image_name, config_path=args.config
)
def _generate_run_config( def _generate_run_config(
build_config: BuildConfig, build_dir: Path, image_name: str build_config: BuildConfig,
build_dir: Path,
image_name: str,
) -> None: ) -> None:
""" """
Generate a run.yaml template file for user to edit from a build.yaml file Generate a run.yaml template file for user to edit from a build.yaml file
""" """
apis = list(build_config.distribution_spec.providers.keys()) apis = list(build_config.distribution_spec.providers.keys())
run_config = StackRunConfig( run_config = StackRunConfig(
docker_image=( container_image=(
image_name if build_config.image_type == ImageType.docker.value else None image_name if build_config.image_type == ImageType.container.value else None
), ),
image_name=image_name, image_name=image_name,
apis=apis, apis=apis,
@ -227,8 +240,9 @@ def _generate_run_config(
to_write = json.loads(run_config.model_dump_json()) to_write = json.loads(run_config.model_dump_json())
f.write(yaml.dump(to_write, sort_keys=False)) f.write(yaml.dump(to_write, sort_keys=False))
# this path is only invoked when no template is provided
cprint( cprint(
f"You can now edit {run_config_file} and run `llama stack run {image_name}`", f"You can now run your stack with `llama stack run {run_config_file}`",
color="green", color="green",
) )
@ -237,8 +251,9 @@ def _run_stack_build_command_from_build_config(
build_config: BuildConfig, build_config: BuildConfig,
image_name: Optional[str] = None, image_name: Optional[str] = None,
template_name: Optional[str] = None, template_name: Optional[str] = None,
config_path: Optional[str] = None,
) -> None: ) -> None:
if build_config.image_type == ImageType.docker.value: if build_config.image_type == ImageType.container.value:
if template_name: if template_name:
image_name = f"distribution-{template_name}" image_name = f"distribution-{template_name}"
else: else:
@ -263,7 +278,10 @@ def _run_stack_build_command_from_build_config(
f.write(yaml.dump(to_write, sort_keys=False)) f.write(yaml.dump(to_write, sort_keys=False))
return_code = build_image( return_code = build_image(
build_config, build_file_path, image_name, template_name=template_name build_config,
build_file_path,
image_name,
template_or_config=template_name or config_path,
) )
if return_code != 0: if return_code != 0:
return return
@ -277,7 +295,7 @@ def _run_stack_build_command_from_build_config(
with importlib.resources.as_file(template_path) as path: with importlib.resources.as_file(template_path) as path:
run_config_file = build_dir / f"{template_name}-run.yaml" run_config_file = build_dir / f"{template_name}-run.yaml"
shutil.copy(path, run_config_file) shutil.copy(path, run_config_file)
# Find all ${env.VARIABLE} patterns
cprint("Build Successful!", color="green") cprint("Build Successful!", color="green")
else: else:
_generate_run_config(build_config, build_dir, image_name) _generate_run_config(build_config, build_dir, image_name)

View file

@ -47,8 +47,8 @@ class StackBuild(Subcommand):
self.parser.add_argument( self.parser.add_argument(
"--image-type", "--image-type",
type=str, type=str,
help="Image Type to use for the build. This can be either conda or docker. If not specified, will use the image type from the template config.", help="Image Type to use for the build. This can be either conda or container or venv. If not specified, will use the image type from the template config.",
choices=["conda", "docker", "venv"], choices=["conda", "container", "venv"],
default="conda", default="conda",
) )

View file

@ -27,7 +27,7 @@ class StackConfigure(Subcommand):
self.parser.add_argument( self.parser.add_argument(
"config", "config",
type=str, type=str,
help="Path to the build config file (e.g. ~/.llama/builds/<image_type>/<name>-build.yaml). For docker, this could also be the name of the docker image. ", help="Path to the build config file (e.g. ~/.llama/builds/<image_type>/<name>-build.yaml). For container, this could also be the name of the container image. ",
) )
self.parser.add_argument( self.parser.add_argument(

View file

@ -78,12 +78,15 @@ class StackRun(Subcommand):
config_file = Path(args.config) config_file = Path(args.config)
has_yaml_suffix = args.config.endswith(".yaml") has_yaml_suffix = args.config.endswith(".yaml")
template_name = None
if not config_file.exists() and not has_yaml_suffix: if not config_file.exists() and not has_yaml_suffix:
# check if this is a template # check if this is a template
config_file = ( config_file = (
Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml" Path(REPO_ROOT) / "llama_stack" / "templates" / args.config / "run.yaml"
) )
if config_file.exists():
template_name = args.config
if not config_file.exists() and not has_yaml_suffix: if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to conda dir # check if it's a build config saved to conda dir
@ -92,9 +95,9 @@ class StackRun(Subcommand):
) )
if not config_file.exists() and not has_yaml_suffix: if not config_file.exists() and not has_yaml_suffix:
# check if it's a build config saved to docker dir # check if it's a build config saved to container dir
config_file = Path( config_file = Path(
BUILDS_BASE_DIR / ImageType.docker.value / f"{args.config}-run.yaml" BUILDS_BASE_DIR / ImageType.container.value / f"{args.config}-run.yaml"
) )
if not config_file.exists() and not has_yaml_suffix: if not config_file.exists() and not has_yaml_suffix:
@ -115,12 +118,17 @@ class StackRun(Subcommand):
config_dict = yaml.safe_load(config_file.read_text()) config_dict = yaml.safe_load(config_file.read_text())
config = parse_and_maybe_upgrade_config(config_dict) config = parse_and_maybe_upgrade_config(config_dict)
if config.docker_image: if config.container_image:
script = ( script = (
importlib.resources.files("llama_stack") importlib.resources.files("llama_stack")
/ "distribution/start_container.sh" / "distribution/start_container.sh"
) )
run_args = [script, config.docker_image] image_name = (
f"distribution-{template_name}"
if template_name
else config.container_image
)
run_args = [script, image_name]
else: else:
current_conda_env = os.environ.get("CONDA_DEFAULT_ENV") current_conda_env = os.environ.get("CONDA_DEFAULT_ENV")
image_name = args.image_name or current_conda_env image_name = args.image_name or current_conda_env

View file

@ -10,7 +10,7 @@ import sys
from enum import Enum from enum import Enum
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional from typing import Dict, List
from pydantic import BaseModel from pydantic import BaseModel
from termcolor import cprint from termcolor import cprint
@ -38,7 +38,7 @@ SERVER_DEPENDENCIES = [
class ImageType(Enum): class ImageType(Enum):
docker = "docker" container = "container"
conda = "conda" conda = "conda"
venv = "venv" venv = "venv"
@ -77,8 +77,8 @@ def get_provider_dependencies(
provider_spec = providers_for_api[provider_type] provider_spec = providers_for_api[provider_type]
deps.extend(provider_spec.pip_packages) deps.extend(provider_spec.pip_packages)
if provider_spec.docker_image: if provider_spec.container_image:
raise ValueError("A stack's dependencies cannot have a docker image") raise ValueError("A stack's dependencies cannot have a container image")
normal_deps = [] normal_deps = []
special_deps = [] special_deps = []
@ -107,25 +107,28 @@ def build_image(
build_config: BuildConfig, build_config: BuildConfig,
build_file_path: Path, build_file_path: Path,
image_name: str, image_name: str,
template_name: Optional[str] = None, template_or_config: str,
): ):
docker_image = build_config.distribution_spec.docker_image or "python:3.10-slim" container_base = (
build_config.distribution_spec.container_image or "python:3.10-slim"
)
normal_deps, special_deps = get_provider_dependencies( normal_deps, special_deps = get_provider_dependencies(
build_config.distribution_spec.providers build_config.distribution_spec.providers
) )
normal_deps += SERVER_DEPENDENCIES normal_deps += SERVER_DEPENDENCIES
if build_config.image_type == ImageType.docker.value: if build_config.image_type == ImageType.container.value:
script = str( script = str(
importlib.resources.files("llama_stack") / "distribution/build_container.sh" importlib.resources.files("llama_stack") / "distribution/build_container.sh"
) )
args = [ args = [
script, script,
template_or_config,
image_name, image_name,
docker_image, container_base,
str(build_file_path), str(build_file_path),
str(BUILDS_BASE_DIR / ImageType.docker.value), str(BUILDS_BASE_DIR / ImageType.container.value),
" ".join(normal_deps), " ".join(normal_deps),
] ]
elif build_config.image_type == ImageType.conda.value: elif build_config.image_type == ImageType.conda.value:

View file

@ -12,22 +12,22 @@ TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
PYPI_VERSION=${PYPI_VERSION:-} PYPI_VERSION=${PYPI_VERSION:-}
BUILD_PLATFORM=${BUILD_PLATFORM:-} BUILD_PLATFORM=${BUILD_PLATFORM:-}
if [ "$#" -lt 4 ]; then if [ "$#" -lt 6 ]; then
echo "Usage: $0 <build_name> <docker_base> <pip_dependencies> [<special_pip_deps>]" >&2 # This only works for templates
echo "Example: $0 my-fastapi-app python:3.9-slim 'fastapi uvicorn' " >&2 echo "Usage: $0 <template_or_config> <image_name> <container_base> <build_file_path> <host_build_dir> <pip_dependencies> [<special_pip_deps>]" >&2
exit 1 exit 1
fi fi
special_pip_deps="$6"
set -euo pipefail set -euo pipefail
build_name="$1" template_or_config="$1"
image_name="distribution-$build_name" image_name="$2"
docker_base=$2 container_base="$3"
build_file_path=$3 build_file_path="$4"
host_build_dir=$4 host_build_dir="$5"
pip_dependencies=$5 pip_dependencies="$6"
special_pip_deps="$7"
# Define color codes # Define color codes
RED='\033[0;31m' RED='\033[0;31m'
@ -36,14 +36,14 @@ NC='\033[0m' # No Color
SCRIPT_DIR=$(dirname "$(readlink -f "$0")") SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
REPO_DIR=$(dirname $(dirname "$SCRIPT_DIR")) REPO_DIR=$(dirname $(dirname "$SCRIPT_DIR"))
DOCKER_BINARY=${DOCKER_BINARY:-docker} CONTAINER_BINARY=${CONTAINER_BINARY:-docker}
DOCKER_OPTS=${DOCKER_OPTS:-} CONTAINER_OPTS=${CONTAINER_OPTS:-}
TEMP_DIR=$(mktemp -d) TEMP_DIR=$(mktemp -d)
add_to_docker() { add_to_container() {
local input local input
output_file="$TEMP_DIR/Dockerfile" output_file="$TEMP_DIR/Containerfile"
if [ -t 0 ]; then if [ -t 0 ]; then
printf '%s\n' "$1" >>"$output_file" printf '%s\n' "$1" >>"$output_file"
else else
@ -53,9 +53,9 @@ add_to_docker() {
} }
# Update and install UBI9 components if UBI9 base image is used # Update and install UBI9 components if UBI9 base image is used
if [[ $docker_base == *"registry.access.redhat.com/ubi9"* ]]; then if [[ $container_base == *"registry.access.redhat.com/ubi9"* ]]; then
add_to_docker << EOF add_to_container << EOF
FROM $docker_base FROM $container_base
WORKDIR /app WORKDIR /app
RUN microdnf -y update && microdnf install -y iputils net-tools wget \ RUN microdnf -y update && microdnf install -y iputils net-tools wget \
@ -64,8 +64,8 @@ RUN microdnf -y update && microdnf install -y iputils net-tools wget \
EOF EOF
else else
add_to_docker << EOF add_to_container << EOF
FROM $docker_base FROM $container_base
WORKDIR /app WORKDIR /app
RUN apt-get update && apt-get install -y \ RUN apt-get update && apt-get install -y \
@ -82,7 +82,7 @@ fi
# Add pip dependencies first since llama-stack is what will change most often # Add pip dependencies first since llama-stack is what will change most often
# so we can reuse layers. # so we can reuse layers.
if [ -n "$pip_dependencies" ]; then if [ -n "$pip_dependencies" ]; then
add_to_docker << EOF add_to_container << EOF
RUN pip install --no-cache $pip_dependencies RUN pip install --no-cache $pip_dependencies
EOF EOF
fi fi
@ -90,7 +90,7 @@ fi
if [ -n "$special_pip_deps" ]; then if [ -n "$special_pip_deps" ]; then
IFS='#' read -ra parts <<<"$special_pip_deps" IFS='#' read -ra parts <<<"$special_pip_deps"
for part in "${parts[@]}"; do for part in "${parts[@]}"; do
add_to_docker <<EOF add_to_container <<EOF
RUN pip install --no-cache $part RUN pip install --no-cache $part
EOF EOF
done done
@ -108,16 +108,16 @@ if [ -n "$LLAMA_STACK_DIR" ]; then
# Install in editable format. We will mount the source code into the container # Install in editable format. We will mount the source code into the container
# so that changes will be reflected in the container without having to do a # so that changes will be reflected in the container without having to do a
# rebuild. This is just for development convenience. # rebuild. This is just for development convenience.
add_to_docker << EOF add_to_container << EOF
RUN pip install --no-cache -e $stack_mount RUN pip install --no-cache -e $stack_mount
EOF EOF
else else
if [ -n "$TEST_PYPI_VERSION" ]; then if [ -n "$TEST_PYPI_VERSION" ]; then
# these packages are damaged in test-pypi, so install them first # these packages are damaged in test-pypi, so install them first
add_to_docker << EOF add_to_container << EOF
RUN pip install fastapi libcst RUN pip install fastapi libcst
EOF EOF
add_to_docker << EOF add_to_container << EOF
RUN pip install --no-cache --extra-index-url https://test.pypi.org/simple/ \ RUN pip install --no-cache --extra-index-url https://test.pypi.org/simple/ \
llama-models==$TEST_PYPI_VERSION llama-stack-client==$TEST_PYPI_VERSION llama-stack==$TEST_PYPI_VERSION llama-models==$TEST_PYPI_VERSION llama-stack-client==$TEST_PYPI_VERSION llama-stack==$TEST_PYPI_VERSION
@ -128,7 +128,7 @@ EOF
else else
SPEC_VERSION="llama-stack" SPEC_VERSION="llama-stack"
fi fi
add_to_docker << EOF add_to_container << EOF
RUN pip install --no-cache $SPEC_VERSION RUN pip install --no-cache $SPEC_VERSION
EOF EOF
fi fi
@ -140,24 +140,26 @@ if [ -n "$LLAMA_MODELS_DIR" ]; then
exit 1 exit 1
fi fi
add_to_docker << EOF add_to_container << EOF
RUN pip uninstall -y llama-models RUN pip uninstall -y llama-models
RUN pip install --no-cache $models_mount RUN pip install --no-cache $models_mount
EOF EOF
fi fi
add_to_docker << EOF # if template_or_config ends with .yaml, it is not a template and we should not use the --template flag
if [[ "$template_or_config" != *.yaml ]]; then
# This would be good in production but for debugging flexibility lets not add it right now add_to_container << EOF
# We need a more solid production ready entrypoint.sh anyway ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server", "--template", "$template_or_config"]
#
ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server", "--template", "$build_name"]
EOF EOF
else
add_to_container << EOF
ENTRYPOINT ["python", "-m", "llama_stack.distribution.server.server"]
EOF
fi
printf "Dockerfile created successfully in $TEMP_DIR/Dockerfile\n\n" printf "Containerfile created successfully in $TEMP_DIR/Containerfile\n\n"
cat $TEMP_DIR/Dockerfile cat $TEMP_DIR/Containerfile
printf "\n" printf "\n"
mounts="" mounts=""
@ -170,11 +172,13 @@ fi
if command -v selinuxenabled &>/dev/null && selinuxenabled; then if command -v selinuxenabled &>/dev/null && selinuxenabled; then
# Disable SELinux labels -- we don't want to relabel the llama-stack source dir # Disable SELinux labels -- we don't want to relabel the llama-stack source dir
DOCKER_OPTS="$DOCKER_OPTS --security-opt label=disable" CONTAINER_OPTS="$CONTAINER_OPTS --security-opt label=disable"
fi fi
# Set version tag based on PyPI version # Set version tag based on PyPI version
if [ -n "$TEST_PYPI_VERSION" ]; then if [ -n "$PYPI_VERSION" ]; then
version_tag="$PYPI_VERSION"
elif [ -n "$TEST_PYPI_VERSION" ]; then
version_tag="test-$TEST_PYPI_VERSION" version_tag="test-$TEST_PYPI_VERSION"
elif [[ -n "$LLAMA_STACK_DIR" || -n "$LLAMA_MODELS_DIR" ]]; then elif [[ -n "$LLAMA_STACK_DIR" || -n "$LLAMA_MODELS_DIR" ]]; then
version_tag="dev" version_tag="dev"
@ -200,7 +204,7 @@ else
fi fi
set -x set -x
$DOCKER_BINARY build $DOCKER_OPTS $PLATFORM -t $image_tag -f "$TEMP_DIR/Dockerfile" "$REPO_DIR" $mounts $CONTAINER_BINARY build $CONTAINER_OPTS $PLATFORM -t $image_tag -f "$TEMP_DIR/Containerfile" "$REPO_DIR" $mounts
# clean up tmp/configs # clean up tmp/configs
set +x set +x

View file

@ -6,8 +6,8 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
DOCKER_BINARY=${DOCKER_BINARY:-docker} CONTAINER_BINARY=${CONTAINER_BINARY:-docker}
DOCKER_OPTS=${DOCKER_OPTS:-} CONTAINER_OPTS=${CONTAINER_OPTS:-}
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-} LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
set -euo pipefail set -euo pipefail
@ -24,13 +24,13 @@ if [ $# -lt 2 ]; then
exit 1 exit 1
fi fi
docker_image="$1" container_image="$1"
host_build_dir="$2" host_build_dir="$2"
container_build_dir="/app/builds" container_build_dir="/app/builds"
if command -v selinuxenabled &> /dev/null && selinuxenabled; then if command -v selinuxenabled &> /dev/null && selinuxenabled; then
# Disable SELinux labels # Disable SELinux labels
DOCKER_OPTS="$DOCKER_OPTS --security-opt label=disable" CONTAINER_OPTS="$CONTAINER_OPTS --security-opt label=disable"
fi fi
mounts="" mounts=""
@ -39,9 +39,9 @@ if [ -n "$LLAMA_STACK_DIR" ]; then
fi fi
set -x set -x
$DOCKER_BINARY run $DOCKER_OPTS -it \ $CONTAINER_BINARY run $CONTAINER_OPTS -it \
--entrypoint "/usr/local/bin/llama" \ --entrypoint "/usr/local/bin/llama" \
-v $host_build_dir:$container_build_dir \ -v $host_build_dir:$container_build_dir \
$mounts \ $mounts \
$docker_image \ $container_image \
stack configure ./llamastack-build.yaml --output-dir $container_build_dir stack configure ./llamastack-build.yaml --output-dir $container_build_dir

View file

@ -13,14 +13,14 @@ from llama_stack.apis.datasets import Dataset, DatasetInput
from llama_stack.apis.eval import Eval from llama_stack.apis.eval import Eval
from llama_stack.apis.eval_tasks import EvalTask, EvalTaskInput from llama_stack.apis.eval_tasks import EvalTask, EvalTaskInput
from llama_stack.apis.inference import Inference from llama_stack.apis.inference import Inference
from llama_stack.apis.memory import Memory
from llama_stack.apis.memory_banks import MemoryBank, MemoryBankInput
from llama_stack.apis.models import Model, ModelInput from llama_stack.apis.models import Model, ModelInput
from llama_stack.apis.safety import Safety from llama_stack.apis.safety import Safety
from llama_stack.apis.scoring import Scoring from llama_stack.apis.scoring import Scoring
from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnInput from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnInput
from llama_stack.apis.shields import Shield, ShieldInput from llama_stack.apis.shields import Shield, ShieldInput
from llama_stack.apis.tools import Tool, ToolGroup, ToolGroupInput, ToolRuntime from llama_stack.apis.tools import Tool, ToolGroup, ToolGroupInput, ToolRuntime
from llama_stack.apis.vector_dbs import VectorDB, VectorDBInput
from llama_stack.apis.vector_io import VectorIO
from llama_stack.providers.datatypes import Api, ProviderSpec from llama_stack.providers.datatypes import Api, ProviderSpec
from llama_stack.providers.utils.kvstore.config import KVStoreConfig from llama_stack.providers.utils.kvstore.config import KVStoreConfig
@ -34,7 +34,7 @@ RoutingKey = Union[str, List[str]]
RoutableObject = Union[ RoutableObject = Union[
Model, Model,
Shield, Shield,
MemoryBank, VectorDB,
Dataset, Dataset,
ScoringFn, ScoringFn,
EvalTask, EvalTask,
@ -47,7 +47,7 @@ RoutableObjectWithProvider = Annotated[
Union[ Union[
Model, Model,
Shield, Shield,
MemoryBank, VectorDB,
Dataset, Dataset,
ScoringFn, ScoringFn,
EvalTask, EvalTask,
@ -60,7 +60,7 @@ RoutableObjectWithProvider = Annotated[
RoutedProtocol = Union[ RoutedProtocol = Union[
Inference, Inference,
Safety, Safety,
Memory, VectorIO,
DatasetIO, DatasetIO,
Scoring, Scoring,
Eval, Eval,
@ -73,7 +73,7 @@ class AutoRoutedProviderSpec(ProviderSpec):
provider_type: str = "router" provider_type: str = "router"
config_class: str = "" config_class: str = ""
docker_image: Optional[str] = None container_image: Optional[str] = None
routing_table_api: Api routing_table_api: Api
module: str module: str
provider_data_validator: Optional[str] = Field( provider_data_validator: Optional[str] = Field(
@ -89,7 +89,7 @@ class AutoRoutedProviderSpec(ProviderSpec):
class RoutingTableProviderSpec(ProviderSpec): class RoutingTableProviderSpec(ProviderSpec):
provider_type: str = "routing_table" provider_type: str = "routing_table"
config_class: str = "" config_class: str = ""
docker_image: Optional[str] = None container_image: Optional[str] = None
router_api: Api router_api: Api
module: str module: str
@ -101,7 +101,7 @@ class DistributionSpec(BaseModel):
default="", default="",
description="Description of the distribution", description="Description of the distribution",
) )
docker_image: Optional[str] = None container_image: Optional[str] = None
providers: Dict[str, Union[str, List[str]]] = Field( providers: Dict[str, Union[str, List[str]]] = Field(
default_factory=dict, default_factory=dict,
description=""" description="""
@ -127,9 +127,9 @@ Reference to the distribution this package refers to. For unregistered (adhoc) p
this could be just a hash this could be just a hash
""", """,
) )
docker_image: Optional[str] = Field( container_image: Optional[str] = Field(
default=None, default=None,
description="Reference to the docker image if this package refers to a container", description="Reference to the container image if this package refers to a container",
) )
apis: List[str] = Field( apis: List[str] = Field(
default_factory=list, default_factory=list,
@ -153,7 +153,7 @@ a default SQLite store will be used.""",
# registry of "resources" in the distribution # registry of "resources" in the distribution
models: List[ModelInput] = Field(default_factory=list) models: List[ModelInput] = Field(default_factory=list)
shields: List[ShieldInput] = Field(default_factory=list) shields: List[ShieldInput] = Field(default_factory=list)
memory_banks: List[MemoryBankInput] = Field(default_factory=list) vector_dbs: List[VectorDBInput] = Field(default_factory=list)
datasets: List[DatasetInput] = Field(default_factory=list) datasets: List[DatasetInput] = Field(default_factory=list)
scoring_fns: List[ScoringFnInput] = Field(default_factory=list) scoring_fns: List[ScoringFnInput] = Field(default_factory=list)
eval_tasks: List[EvalTaskInput] = Field(default_factory=list) eval_tasks: List[EvalTaskInput] = Field(default_factory=list)
@ -168,5 +168,5 @@ class BuildConfig(BaseModel):
) )
image_type: str = Field( image_type: str = Field(
default="conda", default="conda",
description="Type of package to build (conda | docker | venv)", description="Type of package to build (conda | container | venv)",
) )

View file

@ -32,8 +32,8 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]:
router_api=Api.safety, router_api=Api.safety,
), ),
AutoRoutedApiInfo( AutoRoutedApiInfo(
routing_table_api=Api.memory_banks, routing_table_api=Api.vector_dbs,
router_api=Api.memory, router_api=Api.vector_io,
), ),
AutoRoutedApiInfo( AutoRoutedApiInfo(
routing_table_api=Api.datasets, routing_table_api=Api.datasets,

View file

@ -129,8 +129,8 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
import nest_asyncio import nest_asyncio
nest_asyncio.apply() nest_asyncio.apply()
if not self.skip_logger_removal: if not self.skip_logger_removal:
self._remove_root_logger_handlers() self._remove_root_logger_handlers()
return asyncio.run(self.async_client.initialize()) return asyncio.run(self.async_client.initialize())

View file

@ -15,8 +15,6 @@ from llama_stack.apis.eval import Eval
from llama_stack.apis.eval_tasks import EvalTasks from llama_stack.apis.eval_tasks import EvalTasks
from llama_stack.apis.inference import Inference from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect from llama_stack.apis.inspect import Inspect
from llama_stack.apis.memory import Memory
from llama_stack.apis.memory_banks import MemoryBanks
from llama_stack.apis.models import Models from llama_stack.apis.models import Models
from llama_stack.apis.post_training import PostTraining from llama_stack.apis.post_training import PostTraining
from llama_stack.apis.safety import Safety from llama_stack.apis.safety import Safety
@ -25,6 +23,8 @@ from llama_stack.apis.scoring_functions import ScoringFunctions
from llama_stack.apis.shields import Shields from llama_stack.apis.shields import Shields
from llama_stack.apis.telemetry import Telemetry from llama_stack.apis.telemetry import Telemetry
from llama_stack.apis.tools import ToolGroups, ToolRuntime from llama_stack.apis.tools import ToolGroups, ToolRuntime
from llama_stack.apis.vector_dbs import VectorDBs
from llama_stack.apis.vector_io import VectorIO
from llama_stack.distribution.client import get_client_impl from llama_stack.distribution.client import get_client_impl
from llama_stack.distribution.datatypes import ( from llama_stack.distribution.datatypes import (
AutoRoutedProviderSpec, AutoRoutedProviderSpec,
@ -40,7 +40,6 @@ from llama_stack.providers.datatypes import (
DatasetsProtocolPrivate, DatasetsProtocolPrivate,
EvalTasksProtocolPrivate, EvalTasksProtocolPrivate,
InlineProviderSpec, InlineProviderSpec,
MemoryBanksProtocolPrivate,
ModelsProtocolPrivate, ModelsProtocolPrivate,
ProviderSpec, ProviderSpec,
RemoteProviderConfig, RemoteProviderConfig,
@ -48,6 +47,7 @@ from llama_stack.providers.datatypes import (
ScoringFunctionsProtocolPrivate, ScoringFunctionsProtocolPrivate,
ShieldsProtocolPrivate, ShieldsProtocolPrivate,
ToolsProtocolPrivate, ToolsProtocolPrivate,
VectorDBsProtocolPrivate,
) )
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -62,8 +62,8 @@ def api_protocol_map() -> Dict[Api, Any]:
Api.agents: Agents, Api.agents: Agents,
Api.inference: Inference, Api.inference: Inference,
Api.inspect: Inspect, Api.inspect: Inspect,
Api.memory: Memory, Api.vector_io: VectorIO,
Api.memory_banks: MemoryBanks, Api.vector_dbs: VectorDBs,
Api.models: Models, Api.models: Models,
Api.safety: Safety, Api.safety: Safety,
Api.shields: Shields, Api.shields: Shields,
@ -84,7 +84,7 @@ def additional_protocols_map() -> Dict[Api, Any]:
return { return {
Api.inference: (ModelsProtocolPrivate, Models, Api.models), Api.inference: (ModelsProtocolPrivate, Models, Api.models),
Api.tool_groups: (ToolsProtocolPrivate, ToolGroups, Api.tool_groups), Api.tool_groups: (ToolsProtocolPrivate, ToolGroups, Api.tool_groups),
Api.memory: (MemoryBanksProtocolPrivate, MemoryBanks, Api.memory_banks), Api.vector_io: (VectorDBsProtocolPrivate, VectorDBs, Api.vector_dbs),
Api.safety: (ShieldsProtocolPrivate, Shields, Api.shields), Api.safety: (ShieldsProtocolPrivate, Shields, Api.shields),
Api.datasetio: (DatasetsProtocolPrivate, Datasets, Api.datasets), Api.datasetio: (DatasetsProtocolPrivate, Datasets, Api.datasets),
Api.scoring: ( Api.scoring: (
@ -145,7 +145,9 @@ async def resolve_impls(
log.warning( log.warning(
f"Provider `{provider.provider_type}` for API `{api}` is deprecated and will be removed in a future release: {p.deprecation_warning}", f"Provider `{provider.provider_type}` for API `{api}` is deprecated and will be removed in a future release: {p.deprecation_warning}",
) )
p.deps__ = [a.value for a in p.api_dependencies] p.deps__ = [a.value for a in p.api_dependencies] + [
a.value for a in p.optional_api_dependencies
]
spec = ProviderWithSpec( spec = ProviderWithSpec(
spec=p, spec=p,
**(provider.model_dump()), **(provider.model_dump()),
@ -229,6 +231,9 @@ async def resolve_impls(
inner_impls_by_provider_id = {f"inner-{x.value}": {} for x in router_apis} inner_impls_by_provider_id = {f"inner-{x.value}": {} for x in router_apis}
for api_str, provider in sorted_providers: for api_str, provider in sorted_providers:
deps = {a: impls[a] for a in provider.spec.api_dependencies} deps = {a: impls[a] for a in provider.spec.api_dependencies}
for a in provider.spec.optional_api_dependencies:
if a in impls:
deps[a] = impls[a]
inner_impls = {} inner_impls = {}
if isinstance(provider.spec, RoutingTableProviderSpec): if isinstance(provider.spec, RoutingTableProviderSpec):
@ -265,7 +270,7 @@ def topological_sort(
deps.append(dep) deps.append(dep)
for dep in deps: for dep in deps:
if dep not in visited: if dep not in visited and dep in providers_with_specs:
dfs((dep, providers_with_specs[dep]), visited, stack) dfs((dep, providers_with_specs[dep]), visited, stack)
stack.append(api_str) stack.append(api_str)
@ -328,6 +333,8 @@ async def instantiate_provider(
impl.__provider_spec__ = provider_spec impl.__provider_spec__ = provider_spec
impl.__provider_config__ = config impl.__provider_config__ = config
# TODO: check compliance for special tool groups
# the impl should be for Api.tool_runtime, the name should be the special tool group, the protocol should be the special tool group protocol
check_protocol_compliance(impl, protocols[provider_spec.api]) check_protocol_compliance(impl, protocols[provider_spec.api])
if ( if (
not isinstance(provider_spec, AutoRoutedProviderSpec) not isinstance(provider_spec, AutoRoutedProviderSpec)

View file

@ -14,11 +14,11 @@ from llama_stack.providers.datatypes import Api, RoutingTable
from .routing_tables import ( from .routing_tables import (
DatasetsRoutingTable, DatasetsRoutingTable,
EvalTasksRoutingTable, EvalTasksRoutingTable,
MemoryBanksRoutingTable,
ModelsRoutingTable, ModelsRoutingTable,
ScoringFunctionsRoutingTable, ScoringFunctionsRoutingTable,
ShieldsRoutingTable, ShieldsRoutingTable,
ToolGroupsRoutingTable, ToolGroupsRoutingTable,
VectorDBsRoutingTable,
) )
@ -29,7 +29,7 @@ async def get_routing_table_impl(
dist_registry: DistributionRegistry, dist_registry: DistributionRegistry,
) -> Any: ) -> Any:
api_to_tables = { api_to_tables = {
"memory_banks": MemoryBanksRoutingTable, "vector_dbs": VectorDBsRoutingTable,
"models": ModelsRoutingTable, "models": ModelsRoutingTable,
"shields": ShieldsRoutingTable, "shields": ShieldsRoutingTable,
"datasets": DatasetsRoutingTable, "datasets": DatasetsRoutingTable,
@ -51,14 +51,14 @@ async def get_auto_router_impl(api: Api, routing_table: RoutingTable, _deps) ->
DatasetIORouter, DatasetIORouter,
EvalRouter, EvalRouter,
InferenceRouter, InferenceRouter,
MemoryRouter,
SafetyRouter, SafetyRouter,
ScoringRouter, ScoringRouter,
ToolRuntimeRouter, ToolRuntimeRouter,
VectorIORouter,
) )
api_to_routers = { api_to_routers = {
"memory": MemoryRouter, "vector_io": VectorIORouter,
"inference": InferenceRouter, "inference": InferenceRouter,
"safety": SafetyRouter, "safety": SafetyRouter,
"datasetio": DatasetIORouter, "datasetio": DatasetIORouter,

View file

@ -27,8 +27,6 @@ from llama_stack.apis.inference import (
ToolDefinition, ToolDefinition,
ToolPromptFormat, ToolPromptFormat,
) )
from llama_stack.apis.memory import Memory, MemoryBankDocument, QueryDocumentsResponse
from llama_stack.apis.memory_banks.memory_banks import BankParams
from llama_stack.apis.models import ModelType from llama_stack.apis.models import ModelType
from llama_stack.apis.safety import RunShieldResponse, Safety from llama_stack.apis.safety import RunShieldResponse, Safety
from llama_stack.apis.scoring import ( from llama_stack.apis.scoring import (
@ -38,12 +36,20 @@ from llama_stack.apis.scoring import (
ScoringFnParams, ScoringFnParams,
) )
from llama_stack.apis.shields import Shield from llama_stack.apis.shields import Shield
from llama_stack.apis.tools import ToolDef, ToolRuntime from llama_stack.apis.tools import (
RAGDocument,
RAGQueryConfig,
RAGQueryResult,
RAGToolRuntime,
ToolDef,
ToolRuntime,
)
from llama_stack.apis.vector_io import Chunk, QueryChunksResponse, VectorIO
from llama_stack.providers.datatypes import RoutingTable from llama_stack.providers.datatypes import RoutingTable
class MemoryRouter(Memory): class VectorIORouter(VectorIO):
"""Routes to an provider based on the memory bank identifier""" """Routes to an provider based on the vector db identifier"""
def __init__( def __init__(
self, self,
@ -57,38 +63,40 @@ class MemoryRouter(Memory):
async def shutdown(self) -> None: async def shutdown(self) -> None:
pass pass
async def register_memory_bank( async def register_vector_db(
self, self,
memory_bank_id: str, vector_db_id: str,
params: BankParams, embedding_model: str,
embedding_dimension: Optional[int] = 384,
provider_id: Optional[str] = None, provider_id: Optional[str] = None,
provider_memorybank_id: Optional[str] = None, provider_vector_db_id: Optional[str] = None,
) -> None: ) -> None:
await self.routing_table.register_memory_bank( await self.routing_table.register_vector_db(
memory_bank_id, vector_db_id,
params, embedding_model,
embedding_dimension,
provider_id, provider_id,
provider_memorybank_id, provider_vector_db_id,
) )
async def insert_documents( async def insert_chunks(
self, self,
bank_id: str, vector_db_id: str,
documents: List[MemoryBankDocument], chunks: List[Chunk],
ttl_seconds: Optional[int] = None, ttl_seconds: Optional[int] = None,
) -> None: ) -> None:
return await self.routing_table.get_provider_impl(bank_id).insert_documents( return await self.routing_table.get_provider_impl(vector_db_id).insert_chunks(
bank_id, documents, ttl_seconds vector_db_id, chunks, ttl_seconds
) )
async def query_documents( async def query_chunks(
self, self,
bank_id: str, vector_db_id: str,
query: InterleavedContent, query: InterleavedContent,
params: Optional[Dict[str, Any]] = None, params: Optional[Dict[str, Any]] = None,
) -> QueryDocumentsResponse: ) -> QueryChunksResponse:
return await self.routing_table.get_provider_impl(bank_id).query_documents( return await self.routing_table.get_provider_impl(vector_db_id).query_chunks(
bank_id, query, params vector_db_id, query, params
) )
@ -399,22 +407,54 @@ class EvalRouter(Eval):
class ToolRuntimeRouter(ToolRuntime): class ToolRuntimeRouter(ToolRuntime):
class RagToolImpl(RAGToolRuntime):
def __init__(
self,
routing_table: RoutingTable,
) -> None:
self.routing_table = routing_table
async def query(
self,
content: InterleavedContent,
vector_db_ids: List[str],
query_config: Optional[RAGQueryConfig] = None,
) -> RAGQueryResult:
return await self.routing_table.get_provider_impl(
"query_from_memory"
).query(content, vector_db_ids, query_config)
async def insert(
self,
documents: List[RAGDocument],
vector_db_id: str,
chunk_size_in_tokens: int = 512,
) -> None:
return await self.routing_table.get_provider_impl(
"insert_into_memory"
).insert(documents, vector_db_id, chunk_size_in_tokens)
def __init__( def __init__(
self, self,
routing_table: RoutingTable, routing_table: RoutingTable,
) -> None: ) -> None:
self.routing_table = routing_table self.routing_table = routing_table
# HACK ALERT this should be in sync with "get_all_api_endpoints()"
self.rag_tool = self.RagToolImpl(routing_table)
for method in ("query", "insert"):
setattr(self, f"rag_tool.{method}", getattr(self.rag_tool, method))
async def initialize(self) -> None: async def initialize(self) -> None:
pass pass
async def shutdown(self) -> None: async def shutdown(self) -> None:
pass pass
async def invoke_tool(self, tool_name: str, args: Dict[str, Any]) -> Any: async def invoke_tool(self, tool_name: str, kwargs: Dict[str, Any]) -> Any:
return await self.routing_table.get_provider_impl(tool_name).invoke_tool( return await self.routing_table.get_provider_impl(tool_name).invoke_tool(
tool_name=tool_name, tool_name=tool_name,
args=args, kwargs=kwargs,
) )
async def list_runtime_tools( async def list_runtime_tools(

View file

@ -12,13 +12,6 @@ from llama_stack.apis.common.content_types import URL
from llama_stack.apis.common.type_system import ParamType from llama_stack.apis.common.type_system import ParamType
from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse from llama_stack.apis.datasets import Dataset, Datasets, ListDatasetsResponse
from llama_stack.apis.eval_tasks import EvalTask, EvalTasks, ListEvalTasksResponse from llama_stack.apis.eval_tasks import EvalTask, EvalTasks, ListEvalTasksResponse
from llama_stack.apis.memory_banks import (
BankParams,
ListMemoryBanksResponse,
MemoryBank,
MemoryBanks,
MemoryBankType,
)
from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType from llama_stack.apis.models import ListModelsResponse, Model, Models, ModelType
from llama_stack.apis.resource import ResourceType from llama_stack.apis.resource import ResourceType
from llama_stack.apis.scoring_functions import ( from llama_stack.apis.scoring_functions import (
@ -36,6 +29,7 @@ from llama_stack.apis.tools import (
ToolGroups, ToolGroups,
ToolHost, ToolHost,
) )
from llama_stack.apis.vector_dbs import ListVectorDBsResponse, VectorDB, VectorDBs
from llama_stack.distribution.datatypes import ( from llama_stack.distribution.datatypes import (
RoutableObject, RoutableObject,
RoutableObjectWithProvider, RoutableObjectWithProvider,
@ -59,8 +53,8 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable
return await p.register_model(obj) return await p.register_model(obj)
elif api == Api.safety: elif api == Api.safety:
return await p.register_shield(obj) return await p.register_shield(obj)
elif api == Api.memory: elif api == Api.vector_io:
return await p.register_memory_bank(obj) return await p.register_vector_db(obj)
elif api == Api.datasetio: elif api == Api.datasetio:
return await p.register_dataset(obj) return await p.register_dataset(obj)
elif api == Api.scoring: elif api == Api.scoring:
@ -75,8 +69,8 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable
async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None: async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
api = get_impl_api(p) api = get_impl_api(p)
if api == Api.memory: if api == Api.vector_io:
return await p.unregister_memory_bank(obj.identifier) return await p.unregister_vector_db(obj.identifier)
elif api == Api.inference: elif api == Api.inference:
return await p.unregister_model(obj.identifier) return await p.unregister_model(obj.identifier)
elif api == Api.datasetio: elif api == Api.datasetio:
@ -120,8 +114,8 @@ class CommonRoutingTableImpl(RoutingTable):
p.model_store = self p.model_store = self
elif api == Api.safety: elif api == Api.safety:
p.shield_store = self p.shield_store = self
elif api == Api.memory: elif api == Api.vector_io:
p.memory_bank_store = self p.vector_db_store = self
elif api == Api.datasetio: elif api == Api.datasetio:
p.dataset_store = self p.dataset_store = self
elif api == Api.scoring: elif api == Api.scoring:
@ -145,8 +139,8 @@ class CommonRoutingTableImpl(RoutingTable):
return ("Inference", "model") return ("Inference", "model")
elif isinstance(self, ShieldsRoutingTable): elif isinstance(self, ShieldsRoutingTable):
return ("Safety", "shield") return ("Safety", "shield")
elif isinstance(self, MemoryBanksRoutingTable): elif isinstance(self, VectorDBsRoutingTable):
return ("Memory", "memory_bank") return ("VectorIO", "vector_db")
elif isinstance(self, DatasetsRoutingTable): elif isinstance(self, DatasetsRoutingTable):
return ("DatasetIO", "dataset") return ("DatasetIO", "dataset")
elif isinstance(self, ScoringFunctionsRoutingTable): elif isinstance(self, ScoringFunctionsRoutingTable):
@ -196,9 +190,6 @@ class CommonRoutingTableImpl(RoutingTable):
async def register_object( async def register_object(
self, obj: RoutableObjectWithProvider self, obj: RoutableObjectWithProvider
) -> RoutableObjectWithProvider: ) -> RoutableObjectWithProvider:
# Get existing objects from registry
existing_obj = await self.dist_registry.get(obj.type, obj.identifier)
# if provider_id is not specified, pick an arbitrary one from existing entries # if provider_id is not specified, pick an arbitrary one from existing entries
if not obj.provider_id and len(self.impls_by_provider_id) > 0: if not obj.provider_id and len(self.impls_by_provider_id) > 0:
obj.provider_id = list(self.impls_by_provider_id.keys())[0] obj.provider_id = list(self.impls_by_provider_id.keys())[0]
@ -311,22 +302,23 @@ class ShieldsRoutingTable(CommonRoutingTableImpl, Shields):
return shield return shield
class MemoryBanksRoutingTable(CommonRoutingTableImpl, MemoryBanks): class VectorDBsRoutingTable(CommonRoutingTableImpl, VectorDBs):
async def list_memory_banks(self) -> ListMemoryBanksResponse: async def list_vector_dbs(self) -> ListVectorDBsResponse:
return ListMemoryBanksResponse(data=await self.get_all_with_type("memory_bank")) return ListVectorDBsResponse(data=await self.get_all_with_type("vector_db"))
async def get_memory_bank(self, memory_bank_id: str) -> Optional[MemoryBank]: async def get_vector_db(self, vector_db_id: str) -> Optional[VectorDB]:
return await self.get_object_by_identifier("memory_bank", memory_bank_id) return await self.get_object_by_identifier("vector_db", vector_db_id)
async def register_memory_bank( async def register_vector_db(
self, self,
memory_bank_id: str, vector_db_id: str,
params: BankParams, embedding_model: str,
embedding_dimension: Optional[int] = 384,
provider_id: Optional[str] = None, provider_id: Optional[str] = None,
provider_memory_bank_id: Optional[str] = None, provider_vector_db_id: Optional[str] = None,
) -> MemoryBank: ) -> VectorDB:
if provider_memory_bank_id is None: if provider_vector_db_id is None:
provider_memory_bank_id = memory_bank_id provider_vector_db_id = vector_db_id
if provider_id is None: if provider_id is None:
# If provider_id not specified, use the only provider if it supports this shield type # If provider_id not specified, use the only provider if it supports this shield type
if len(self.impls_by_provider_id) == 1: if len(self.impls_by_provider_id) == 1:
@ -335,44 +327,39 @@ class MemoryBanksRoutingTable(CommonRoutingTableImpl, MemoryBanks):
raise ValueError( raise ValueError(
"No provider specified and multiple providers available. Please specify a provider_id." "No provider specified and multiple providers available. Please specify a provider_id."
) )
model = await self.get_object_by_identifier("model", params.embedding_model) model = await self.get_object_by_identifier("model", embedding_model)
if model is None: if model is None:
if params.embedding_model == "all-MiniLM-L6-v2": if embedding_model == "all-MiniLM-L6-v2":
raise ValueError( raise ValueError(
"Embeddings are now served via Inference providers. " "Embeddings are now served via Inference providers. "
"Please upgrade your run.yaml to include inline::sentence-transformer as an additional inference provider. " "Please upgrade your run.yaml to include inline::sentence-transformer as an additional inference provider. "
"See https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/together/run.yaml for an example." "See https://github.com/meta-llama/llama-stack/blob/main/llama_stack/templates/together/run.yaml for an example."
) )
else: else:
raise ValueError(f"Model {params.embedding_model} not found") raise ValueError(f"Model {embedding_model} not found")
if model.model_type != ModelType.embedding: if model.model_type != ModelType.embedding:
raise ValueError( raise ValueError(f"Model {embedding_model} is not an embedding model")
f"Model {params.embedding_model} is not an embedding model"
)
if "embedding_dimension" not in model.metadata: if "embedding_dimension" not in model.metadata:
raise ValueError( raise ValueError(
f"Model {params.embedding_model} does not have an embedding dimension" f"Model {embedding_model} does not have an embedding dimension"
) )
memory_bank_data = { vector_db_data = {
"identifier": memory_bank_id, "identifier": vector_db_id,
"type": ResourceType.memory_bank.value, "type": ResourceType.vector_db.value,
"provider_id": provider_id, "provider_id": provider_id,
"provider_resource_id": provider_memory_bank_id, "provider_resource_id": provider_vector_db_id,
**params.model_dump(), "embedding_model": embedding_model,
"embedding_dimension": model.metadata["embedding_dimension"],
} }
if params.memory_bank_type == MemoryBankType.vector.value: vector_db = TypeAdapter(VectorDB).validate_python(vector_db_data)
memory_bank_data["embedding_dimension"] = model.metadata[ await self.register_object(vector_db)
"embedding_dimension" return vector_db
]
memory_bank = TypeAdapter(MemoryBank).validate_python(memory_bank_data)
await self.register_object(memory_bank)
return memory_bank
async def unregister_memory_bank(self, memory_bank_id: str) -> None: async def unregister_vector_db(self, vector_db_id: str) -> None:
existing_bank = await self.get_memory_bank(memory_bank_id) existing_vector_db = await self.get_vector_db(vector_db_id)
if existing_bank is None: if existing_vector_db is None:
raise ValueError(f"Memory bank {memory_bank_id} not found") raise ValueError(f"Vector DB {vector_db_id} not found")
await self.unregister_object(existing_bank) await self.unregister_object(existing_vector_db)
class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets): class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):

View file

@ -9,6 +9,8 @@ from typing import Dict, List
from pydantic import BaseModel from pydantic import BaseModel
from llama_stack.apis.tools import RAGToolRuntime, SpecialToolGroup
from llama_stack.apis.version import LLAMA_STACK_API_VERSION from llama_stack.apis.version import LLAMA_STACK_API_VERSION
from llama_stack.distribution.resolver import api_protocol_map from llama_stack.distribution.resolver import api_protocol_map
@ -22,21 +24,39 @@ class ApiEndpoint(BaseModel):
name: str name: str
def toolgroup_protocol_map():
return {
SpecialToolGroup.rag_tool: RAGToolRuntime,
}
def get_all_api_endpoints() -> Dict[Api, List[ApiEndpoint]]: def get_all_api_endpoints() -> Dict[Api, List[ApiEndpoint]]:
apis = {} apis = {}
protocols = api_protocol_map() protocols = api_protocol_map()
toolgroup_protocols = toolgroup_protocol_map()
for api, protocol in protocols.items(): for api, protocol in protocols.items():
endpoints = [] endpoints = []
protocol_methods = inspect.getmembers(protocol, predicate=inspect.isfunction) protocol_methods = inspect.getmembers(protocol, predicate=inspect.isfunction)
# HACK ALERT
if api == Api.tool_runtime:
for tool_group in SpecialToolGroup:
sub_protocol = toolgroup_protocols[tool_group]
sub_protocol_methods = inspect.getmembers(
sub_protocol, predicate=inspect.isfunction
)
for name, method in sub_protocol_methods:
if not hasattr(method, "__webmethod__"):
continue
protocol_methods.append((f"{tool_group.value}.{name}", method))
for name, method in protocol_methods: for name, method in protocol_methods:
if not hasattr(method, "__webmethod__"): if not hasattr(method, "__webmethod__"):
continue continue
webmethod = method.__webmethod__ webmethod = method.__webmethod__
route = f"/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}" route = f"/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}"
if webmethod.method == "GET": if webmethod.method == "GET":
method = "get" method = "get"
elif webmethod.method == "DELETE": elif webmethod.method == "DELETE":

View file

@ -21,8 +21,6 @@ from llama_stack.apis.eval import Eval
from llama_stack.apis.eval_tasks import EvalTasks from llama_stack.apis.eval_tasks import EvalTasks
from llama_stack.apis.inference import Inference from llama_stack.apis.inference import Inference
from llama_stack.apis.inspect import Inspect from llama_stack.apis.inspect import Inspect
from llama_stack.apis.memory import Memory
from llama_stack.apis.memory_banks import MemoryBanks
from llama_stack.apis.models import Models from llama_stack.apis.models import Models
from llama_stack.apis.post_training import PostTraining from llama_stack.apis.post_training import PostTraining
from llama_stack.apis.safety import Safety from llama_stack.apis.safety import Safety
@ -31,7 +29,9 @@ from llama_stack.apis.scoring_functions import ScoringFunctions
from llama_stack.apis.shields import Shields from llama_stack.apis.shields import Shields
from llama_stack.apis.synthetic_data_generation import SyntheticDataGeneration from llama_stack.apis.synthetic_data_generation import SyntheticDataGeneration
from llama_stack.apis.telemetry import Telemetry from llama_stack.apis.telemetry import Telemetry
from llama_stack.apis.tools import ToolGroups, ToolRuntime from llama_stack.apis.tools import RAGToolRuntime, ToolGroups, ToolRuntime
from llama_stack.apis.vector_dbs import VectorDBs
from llama_stack.apis.vector_io import VectorIO
from llama_stack.distribution.datatypes import StackRunConfig from llama_stack.distribution.datatypes import StackRunConfig
from llama_stack.distribution.distribution import get_provider_registry from llama_stack.distribution.distribution import get_provider_registry
from llama_stack.distribution.resolver import ProviderRegistry, resolve_impls from llama_stack.distribution.resolver import ProviderRegistry, resolve_impls
@ -42,7 +42,7 @@ log = logging.getLogger(__name__)
class LlamaStack( class LlamaStack(
MemoryBanks, VectorDBs,
Inference, Inference,
BatchInference, BatchInference,
Agents, Agents,
@ -51,7 +51,7 @@ class LlamaStack(
Datasets, Datasets,
Telemetry, Telemetry,
PostTraining, PostTraining,
Memory, VectorIO,
Eval, Eval,
EvalTasks, EvalTasks,
Scoring, Scoring,
@ -62,6 +62,7 @@ class LlamaStack(
Inspect, Inspect,
ToolGroups, ToolGroups,
ToolRuntime, ToolRuntime,
RAGToolRuntime,
): ):
pass pass
@ -69,7 +70,7 @@ class LlamaStack(
RESOURCES = [ RESOURCES = [
("models", Api.models, "register_model", "list_models"), ("models", Api.models, "register_model", "list_models"),
("shields", Api.shields, "register_shield", "list_shields"), ("shields", Api.shields, "register_shield", "list_shields"),
("memory_banks", Api.memory_banks, "register_memory_bank", "list_memory_banks"), ("vector_dbs", Api.vector_dbs, "register_vector_db", "list_vector_dbs"),
("datasets", Api.datasets, "register_dataset", "list_datasets"), ("datasets", Api.datasets, "register_dataset", "list_datasets"),
( (
"scoring_fns", "scoring_fns",

View file

@ -6,8 +6,8 @@
# This source code is licensed under the terms described in the LICENSE file in # This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree. # the root directory of this source tree.
DOCKER_BINARY=${DOCKER_BINARY:-docker} CONTAINER_BINARY=${CONTAINER_BINARY:-docker}
DOCKER_OPTS=${DOCKER_OPTS:-} CONTAINER_OPTS=${CONTAINER_OPTS:-}
LLAMA_CHECKPOINT_DIR=${LLAMA_CHECKPOINT_DIR:-} LLAMA_CHECKPOINT_DIR=${LLAMA_CHECKPOINT_DIR:-}
LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-} LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-} TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
@ -30,8 +30,8 @@ if [ $# -lt 3 ]; then
exit 1 exit 1
fi fi
build_name="$1" image_name="$1"
docker_image="localhost/distribution-$build_name" container_image="localhost/$image_name"
shift shift
yaml_config="$1" yaml_config="$1"
@ -64,7 +64,7 @@ set -x
if command -v selinuxenabled &> /dev/null && selinuxenabled; then if command -v selinuxenabled &> /dev/null && selinuxenabled; then
# Disable SELinux labels # Disable SELinux labels
DOCKER_OPTS="$DOCKER_OPTS --security-opt label=disable" CONTAINER_OPTS="$CONTAINER_OPTS --security-opt label=disable"
fi fi
mounts="" mounts=""
@ -73,23 +73,25 @@ if [ -n "$LLAMA_STACK_DIR" ]; then
fi fi
if [ -n "$LLAMA_CHECKPOINT_DIR" ]; then if [ -n "$LLAMA_CHECKPOINT_DIR" ]; then
mounts="$mounts -v $LLAMA_CHECKPOINT_DIR:/root/.llama" mounts="$mounts -v $LLAMA_CHECKPOINT_DIR:/root/.llama"
DOCKER_OPTS="$DOCKER_OPTS --gpus=all" CONTAINER_OPTS="$CONTAINER_OPTS --gpus=all"
fi fi
version_tag="latest"
if [ -n "$PYPI_VERSION" ]; then if [ -n "$PYPI_VERSION" ]; then
version_tag="$PYPI_VERSION" version_tag="$PYPI_VERSION"
elif [ -n "$LLAMA_STACK_DIR" ]; then elif [ -n "$LLAMA_STACK_DIR" ]; then
version_tag="dev" version_tag="dev"
elif [ -n "$TEST_PYPI_VERSION" ]; then elif [ -n "$TEST_PYPI_VERSION" ]; then
version_tag="test-$TEST_PYPI_VERSION" version_tag="test-$TEST_PYPI_VERSION"
else
URL="https://pypi.org/pypi/llama-stack/json"
version_tag=$(curl -s $URL | jq -r '.info.version')
fi fi
$DOCKER_BINARY run $DOCKER_OPTS -it \ $CONTAINER_BINARY run $CONTAINER_OPTS -it \
-p $port:$port \ -p $port:$port \
$env_vars \ $env_vars \
-v "$yaml_config:/app/config.yaml" \ -v "$yaml_config:/app/config.yaml" \
$mounts \ $mounts \
--env LLAMA_STACK_PORT=$port \ --env LLAMA_STACK_PORT=$port \
--entrypoint='["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]' \ --entrypoint='["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]' \
$docker_image:$version_tag $container_image:$version_tag

View file

@ -35,7 +35,7 @@ class DistributionRegistry(Protocol):
REGISTER_PREFIX = "distributions:registry" REGISTER_PREFIX = "distributions:registry"
KEY_VERSION = "v5" KEY_VERSION = "v7"
KEY_FORMAT = f"{REGISTER_PREFIX}:{KEY_VERSION}::" + "{type}:{identifier}" KEY_FORMAT = f"{REGISTER_PREFIX}:{KEY_VERSION}::" + "{type}:{identifier}"

View file

@ -9,7 +9,7 @@ import os
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from llama_stack.apis.inference import Model from llama_stack.apis.inference import Model
from llama_stack.apis.memory_banks import VectorMemoryBank from llama_stack.apis.vector_dbs import VectorDB
from llama_stack.distribution.store.registry import ( from llama_stack.distribution.store.registry import (
CachedDiskDistributionRegistry, CachedDiskDistributionRegistry,
@ -42,13 +42,12 @@ async def cached_registry(config):
@pytest.fixture @pytest.fixture
def sample_bank(): def sample_vector_db():
return VectorMemoryBank( return VectorDB(
identifier="test_bank", identifier="test_vector_db",
embedding_model="all-MiniLM-L6-v2", embedding_model="all-MiniLM-L6-v2",
chunk_size_in_tokens=512, embedding_dimension=384,
overlap_size_in_tokens=64, provider_resource_id="test_vector_db",
provider_resource_id="test_bank",
provider_id="test-provider", provider_id="test-provider",
) )
@ -70,19 +69,17 @@ async def test_registry_initialization(registry):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_basic_registration(registry, sample_bank, sample_model): async def test_basic_registration(registry, sample_vector_db, sample_model):
print(f"Registering {sample_bank}") print(f"Registering {sample_vector_db}")
await registry.register(sample_bank) await registry.register(sample_vector_db)
print(f"Registering {sample_model}") print(f"Registering {sample_model}")
await registry.register(sample_model) await registry.register(sample_model)
print("Getting bank") print("Getting vector_db")
result_bank = await registry.get("memory_bank", "test_bank") result_vector_db = await registry.get("vector_db", "test_vector_db")
assert result_bank is not None assert result_vector_db is not None
assert result_bank.identifier == sample_bank.identifier assert result_vector_db.identifier == sample_vector_db.identifier
assert result_bank.embedding_model == sample_bank.embedding_model assert result_vector_db.embedding_model == sample_vector_db.embedding_model
assert result_bank.chunk_size_in_tokens == sample_bank.chunk_size_in_tokens assert result_vector_db.provider_id == sample_vector_db.provider_id
assert result_bank.overlap_size_in_tokens == sample_bank.overlap_size_in_tokens
assert result_bank.provider_id == sample_bank.provider_id
result_model = await registry.get("model", "test_model") result_model = await registry.get("model", "test_model")
assert result_model is not None assert result_model is not None
@ -91,24 +88,23 @@ async def test_basic_registration(registry, sample_bank, sample_model):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_cached_registry_initialization(config, sample_bank, sample_model): async def test_cached_registry_initialization(config, sample_vector_db, sample_model):
# First populate the disk registry # First populate the disk registry
disk_registry = DiskDistributionRegistry(await kvstore_impl(config)) disk_registry = DiskDistributionRegistry(await kvstore_impl(config))
await disk_registry.initialize() await disk_registry.initialize()
await disk_registry.register(sample_bank) await disk_registry.register(sample_vector_db)
await disk_registry.register(sample_model) await disk_registry.register(sample_model)
# Test cached version loads from disk # Test cached version loads from disk
cached_registry = CachedDiskDistributionRegistry(await kvstore_impl(config)) cached_registry = CachedDiskDistributionRegistry(await kvstore_impl(config))
await cached_registry.initialize() await cached_registry.initialize()
result_bank = await cached_registry.get("memory_bank", "test_bank") result_vector_db = await cached_registry.get("vector_db", "test_vector_db")
assert result_bank is not None assert result_vector_db is not None
assert result_bank.identifier == sample_bank.identifier assert result_vector_db.identifier == sample_vector_db.identifier
assert result_bank.embedding_model == sample_bank.embedding_model assert result_vector_db.embedding_model == sample_vector_db.embedding_model
assert result_bank.chunk_size_in_tokens == sample_bank.chunk_size_in_tokens assert result_vector_db.embedding_dimension == sample_vector_db.embedding_dimension
assert result_bank.overlap_size_in_tokens == sample_bank.overlap_size_in_tokens assert result_vector_db.provider_id == sample_vector_db.provider_id
assert result_bank.provider_id == sample_bank.provider_id
@pytest.mark.asyncio @pytest.mark.asyncio
@ -116,29 +112,28 @@ async def test_cached_registry_updates(config):
cached_registry = CachedDiskDistributionRegistry(await kvstore_impl(config)) cached_registry = CachedDiskDistributionRegistry(await kvstore_impl(config))
await cached_registry.initialize() await cached_registry.initialize()
new_bank = VectorMemoryBank( new_vector_db = VectorDB(
identifier="test_bank_2", identifier="test_vector_db_2",
embedding_model="all-MiniLM-L6-v2", embedding_model="all-MiniLM-L6-v2",
chunk_size_in_tokens=256, embedding_dimension=384,
overlap_size_in_tokens=32, provider_resource_id="test_vector_db_2",
provider_resource_id="test_bank_2",
provider_id="baz", provider_id="baz",
) )
await cached_registry.register(new_bank) await cached_registry.register(new_vector_db)
# Verify in cache # Verify in cache
result_bank = await cached_registry.get("memory_bank", "test_bank_2") result_vector_db = await cached_registry.get("vector_db", "test_vector_db_2")
assert result_bank is not None assert result_vector_db is not None
assert result_bank.identifier == new_bank.identifier assert result_vector_db.identifier == new_vector_db.identifier
assert result_bank.provider_id == new_bank.provider_id assert result_vector_db.provider_id == new_vector_db.provider_id
# Verify persisted to disk # Verify persisted to disk
new_registry = DiskDistributionRegistry(await kvstore_impl(config)) new_registry = DiskDistributionRegistry(await kvstore_impl(config))
await new_registry.initialize() await new_registry.initialize()
result_bank = await new_registry.get("memory_bank", "test_bank_2") result_vector_db = await new_registry.get("vector_db", "test_vector_db_2")
assert result_bank is not None assert result_vector_db is not None
assert result_bank.identifier == new_bank.identifier assert result_vector_db.identifier == new_vector_db.identifier
assert result_bank.provider_id == new_bank.provider_id assert result_vector_db.provider_id == new_vector_db.provider_id
@pytest.mark.asyncio @pytest.mark.asyncio
@ -146,30 +141,28 @@ async def test_duplicate_provider_registration(config):
cached_registry = CachedDiskDistributionRegistry(await kvstore_impl(config)) cached_registry = CachedDiskDistributionRegistry(await kvstore_impl(config))
await cached_registry.initialize() await cached_registry.initialize()
original_bank = VectorMemoryBank( original_vector_db = VectorDB(
identifier="test_bank_2", identifier="test_vector_db_2",
embedding_model="all-MiniLM-L6-v2", embedding_model="all-MiniLM-L6-v2",
chunk_size_in_tokens=256, embedding_dimension=384,
overlap_size_in_tokens=32, provider_resource_id="test_vector_db_2",
provider_resource_id="test_bank_2",
provider_id="baz", provider_id="baz",
) )
await cached_registry.register(original_bank) await cached_registry.register(original_vector_db)
duplicate_bank = VectorMemoryBank( duplicate_vector_db = VectorDB(
identifier="test_bank_2", identifier="test_vector_db_2",
embedding_model="different-model", embedding_model="different-model",
chunk_size_in_tokens=128, embedding_dimension=384,
overlap_size_in_tokens=16, provider_resource_id="test_vector_db_2",
provider_resource_id="test_bank_2",
provider_id="baz", # Same provider_id provider_id="baz", # Same provider_id
) )
await cached_registry.register(duplicate_bank) await cached_registry.register(duplicate_vector_db)
result = await cached_registry.get("memory_bank", "test_bank_2") result = await cached_registry.get("vector_db", "test_vector_db_2")
assert result is not None assert result is not None
assert ( assert (
result.embedding_model == original_bank.embedding_model result.embedding_model == original_vector_db.embedding_model
) # Original values preserved ) # Original values preserved
@ -179,36 +172,35 @@ async def test_get_all_objects(config):
await cached_registry.initialize() await cached_registry.initialize()
# Create multiple test banks # Create multiple test banks
test_banks = [ test_vector_dbs = [
VectorMemoryBank( VectorDB(
identifier=f"test_bank_{i}", identifier=f"test_vector_db_{i}",
embedding_model="all-MiniLM-L6-v2", embedding_model="all-MiniLM-L6-v2",
chunk_size_in_tokens=256, embedding_dimension=384,
overlap_size_in_tokens=32, provider_resource_id=f"test_vector_db_{i}",
provider_resource_id=f"test_bank_{i}",
provider_id=f"provider_{i}", provider_id=f"provider_{i}",
) )
for i in range(3) for i in range(3)
] ]
# Register all banks # Register all vector_dbs
for bank in test_banks: for vector_db in test_vector_dbs:
await cached_registry.register(bank) await cached_registry.register(vector_db)
# Test get_all retrieval # Test get_all retrieval
all_results = await cached_registry.get_all() all_results = await cached_registry.get_all()
assert len(all_results) == 3 assert len(all_results) == 3
# Verify each bank was stored correctly # Verify each vector_db was stored correctly
for original_bank in test_banks: for original_vector_db in test_vector_dbs:
matching_banks = [ matching_vector_dbs = [
b for b in all_results if b.identifier == original_bank.identifier v for v in all_results if v.identifier == original_vector_db.identifier
] ]
assert len(matching_banks) == 1 assert len(matching_vector_dbs) == 1
stored_bank = matching_banks[0] stored_vector_db = matching_vector_dbs[0]
assert stored_bank.embedding_model == original_bank.embedding_model assert stored_vector_db.embedding_model == original_vector_db.embedding_model
assert stored_bank.provider_id == original_bank.provider_id assert stored_vector_db.provider_id == original_vector_db.provider_id
assert stored_bank.chunk_size_in_tokens == original_bank.chunk_size_in_tokens
assert ( assert (
stored_bank.overlap_size_in_tokens == original_bank.overlap_size_in_tokens stored_vector_db.embedding_dimension
== original_vector_db.embedding_dimension
) )

View file

@ -18,6 +18,7 @@ class LlamaStackApi:
provider_data={ provider_data={
"fireworks_api_key": os.environ.get("FIREWORKS_API_KEY", ""), "fireworks_api_key": os.environ.get("FIREWORKS_API_KEY", ""),
"together_api_key": os.environ.get("TOGETHER_API_KEY", ""), "together_api_key": os.environ.get("TOGETHER_API_KEY", ""),
"sambanova_api_key": os.environ.get("SAMBANOVA_API_KEY", ""),
"openai_api_key": os.environ.get("OPENAI_API_KEY", ""), "openai_api_key": os.environ.get("OPENAI_API_KEY", ""),
}, },
) )

View file

@ -1,23 +0,0 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import streamlit as st
from modules.api import llama_stack_api
def memory_banks():
st.header("Memory Banks")
memory_banks_info = {
m.identifier: m.to_dict() for m in llama_stack_api.client.memory_banks.list()
}
if len(memory_banks_info) > 0:
selected_memory_bank = st.selectbox(
"Select a memory bank", list(memory_banks_info.keys())
)
st.json(memory_banks_info[selected_memory_bank])
else:
st.info("No memory banks found")

View file

@ -6,10 +6,10 @@
from page.distribution.datasets import datasets from page.distribution.datasets import datasets
from page.distribution.eval_tasks import eval_tasks from page.distribution.eval_tasks import eval_tasks
from page.distribution.memory_banks import memory_banks
from page.distribution.models import models from page.distribution.models import models
from page.distribution.scoring_functions import scoring_functions from page.distribution.scoring_functions import scoring_functions
from page.distribution.shields import shields from page.distribution.shields import shields
from page.distribution.vector_dbs import vector_dbs
from streamlit_option_menu import option_menu from streamlit_option_menu import option_menu
@ -17,7 +17,7 @@ from streamlit_option_menu import option_menu
def resources_page(): def resources_page():
options = [ options = [
"Models", "Models",
"Memory Banks", "Vector Databases",
"Shields", "Shields",
"Scoring Functions", "Scoring Functions",
"Datasets", "Datasets",
@ -37,8 +37,8 @@ def resources_page():
) )
if selected_resource == "Eval Tasks": if selected_resource == "Eval Tasks":
eval_tasks() eval_tasks()
elif selected_resource == "Memory Banks": elif selected_resource == "Vector Databases":
memory_banks() vector_dbs()
elif selected_resource == "Datasets": elif selected_resource == "Datasets":
datasets() datasets()
elif selected_resource == "Models": elif selected_resource == "Models":

Some files were not shown because too many files have changed in this diff Show more