Merge branch 'main' into add-nvidia-inference-adapter

This commit is contained in:
Matthew Farrellee 2024-11-20 09:37:48 -05:00
commit 8a35dc8b0e
28 changed files with 429 additions and 478 deletions

CHANGELOG.md (new file, 35 lines)
View file

@@ -0,0 +1,35 @@
# Changelog
## 0.0.53
### Added
- Resource-oriented design for models, shields, memory banks, datasets and eval tasks
- Persistence for registered objects with distribution
- Ability to persist memory banks created for FAISS
- PostgreSQL KVStore implementation
- Environment variable placeholder support in run.yaml files
- Comprehensive Zero-to-Hero notebooks and quickstart guides
- Support for quantized models in Ollama
- Vision model support for Together, Fireworks, Meta-Reference, Ollama, and vLLM
- Bedrock distribution with safety shields support
- Evals API with task registration and scoring functions
- MMLU and SimpleQA benchmark scoring functions
- Huggingface dataset provider integration for benchmarks
- Support for custom dataset registration from local paths
- Benchmark evaluation CLI tools with visualization tables
- RAG evaluation scoring functions and metrics
- Local persistence for datasets and eval tasks
### Changed
- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
- Updated API signatures for dataset and eval task registration
- Restructured folder organization for providers
- Enhanced Docker build configuration
- Added version prefixing for REST API routes
- Enhanced evaluation task registration workflow
- Improved benchmark evaluation output formatting
- Restructured evals folder organization for better modularity
### Removed
- `llama stack configure` command
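Reviewer note on the "environment variable placeholder" item above: the sketch below illustrates the idea with an assumed `${env.VAR}` placeholder syntax and a hypothetical `expand_env_placeholders` helper; the real resolution logic lives inside llama-stack's config loading.
```python
import os
import re

# Hypothetical helper for illustration only; placeholder grammar is assumed.
_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*)\}")

def expand_env_placeholders(text: str) -> str:
    """Replace each ${env.VAR} occurrence with the value of VAR."""
    def _sub(match: re.Match) -> str:
        name = match.group(1)
        if name not in os.environ:
            raise KeyError(f"environment variable {name} is not set")
        return os.environ[name]
    return _PLACEHOLDER.sub(_sub, text)

# e.g. expand_env_placeholders("api_key: ${env.FIREWORKS_API_KEY}")
```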

View file

@@ -1,4 +1,5 @@
 include requirements.txt
+include distributions/dependencies.json
 include llama_stack/distribution/*.sh
 include llama_stack/cli/scripts/*.sh
 include llama_stack/templates/*/*.yaml

View file

@@ -0,0 +1,171 @@
{
"together": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"together",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"remote-vllm": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"openai",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"fireworks": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"tgi": [
"aiohttp",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"meta-reference-gpu": [
"accelerate",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"fairscale",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"lm-format-enforcer",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"torch",
"torchvision",
"tqdm",
"transformers",
"uvicorn",
"zmq",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"ollama": [
"aiohttp",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"ollama",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
]
}
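A note on shape: each top-level key is a distribution name and each value a flat list of pip requirements, where some entries carry extra pip flags (`--no-deps`, `--index-url ...`). A minimal reader, assuming the `distributions/dependencies.json` path from the MANIFEST change above:
```python
import json
import shlex
from pathlib import Path

# Sketch: load the generated per-distribution dependency lists.
deps = json.loads(Path("distributions/dependencies.json").read_text())

for requirement in deps["ollama"]:
    # Entries like "torch --index-url https://..." embed pip flags,
    # so split them shell-style before passing them to an installer.
    print(shlex.split(requirement))
```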

View file

@@ -21,7 +21,7 @@
     "info": {
         "title": "[DRAFT] Llama Stack Specification",
         "version": "alpha",
-        "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-11-18 23:37:24.867143"
+        "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-11-19 09:14:01.145131"
     },
     "servers": [
         {

View file

@@ -3400,7 +3400,7 @@ info:
   description: "This is the specification of the llama stack that provides\n \
     \ a set of endpoints and their corresponding interfaces that are tailored\
     \ to\n best leverage Llama Models. The specification is still in\
-    \ draft and subject to change.\n Generated at 2024-11-18 23:37:24.867143"
+    \ draft and subject to change.\n Generated at 2024-11-19 09:14:01.145131"
   title: '[DRAFT] Llama Stack Specification'
   version: alpha
 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema

View file

@@ -25,8 +25,8 @@ The following models are available by default:
 - `meta-llama/Llama-3.1-8B-Instruct (fireworks/llama-v3p1-8b-instruct)`
 - `meta-llama/Llama-3.1-70B-Instruct (fireworks/llama-v3p1-70b-instruct)`
 - `meta-llama/Llama-3.1-405B-Instruct-FP8 (fireworks/llama-v3p1-405b-instruct)`
-- `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-1b-instruct)`
-- `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-3b-instruct)`
+- `meta-llama/Llama-3.2-1B-Instruct (fireworks/llama-v3p2-1b-instruct)`
+- `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-3b-instruct)`
 - `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-11b-vision-instruct)`
 - `meta-llama/Llama-3.2-90B-Vision-Instruct (fireworks/llama-v3p2-90b-vision-instruct)`
 - `meta-llama/Llama-Guard-3-8B (fireworks/llama-guard-3-8b)`

View file

@@ -23,5 +23,6 @@ tgi
 dell-tgi
 together
 fireworks
+remote-vllm
 bedrock
 ```

View file

@@ -53,9 +53,9 @@ Please see our pages in detail for the types of distributions we offer:
 3. [On-device Distribution](./distributions/ondevice_distro/index.md): If you want to run Llama Stack inference on your iOS / Android device.
-### Quick Start Commands
-Once you have decided on the inference provider and distribution to use, use the following quick start commands to get started.
+### Table of Contents
+Once you have decided on the inference provider and distribution to use, use the following guides to get started.
 ##### 1.0 Prerequisite
@@ -109,421 +109,33 @@ Access to Single-Node CPU with Fireworks hosted endpoint via API_KEY from [firew
 ##### 1.1. Start the distribution
-**(Option 1) Via Docker**
 ::::{tab-set}
 :::{tab-item} meta-reference-gpu
-```
-$ cd llama-stack/distributions/meta-reference-gpu && docker compose up
-```
-This will download and start running a pre-built Docker container. Alternatively, you may use the following commands:
-```
-docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-gpu --yaml_config /root/my-run.yaml
-```
+- [Start Meta Reference GPU Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html)
 :::
 :::{tab-item} vLLM
-```
-$ cd llama-stack/distributions/remote-vllm && docker compose up
-```
-The script will first start up vLLM server on port 8000, then start up Llama Stack distribution server hooking up to it for inference. You should see the following outputs --
-```
-<TO BE FILLED>
-```
-To kill the server
-```
-docker compose down
-```
+- [Start vLLM Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/remote-vllm.html)
 :::
 :::{tab-item} tgi
-```
-$ cd llama-stack/distributions/tgi && docker compose up
-```
-The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference. You should see the following outputs --
-```
-[text-generation-inference] | 2024-10-15T18:56:33.810397Z INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama)
-[text-generation-inference] | 2024-10-15T18:56:33.810448Z WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0
-[text-generation-inference] | 2024-10-15T18:56:33.864143Z INFO text_generation_router::server: router/src/server.rs:2353: Connected
-INFO: Started server process [1]
-INFO: Waiting for application startup.
-INFO: Application startup complete.
-INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
-```
-To kill the server
-```
-docker compose down
-```
+- [Start TGI Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html)
-:::
-:::{tab-item} ollama
-```
-$ cd llama-stack/distributions/ollama && docker compose up
-# OR
-$ cd llama-stack/distributions/ollama-gpu && docker compose up
-```
-You will see outputs similar to following ---
-```
-[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps"
-[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps"
-INFO: Started server process [1]
-INFO: Waiting for application startup.
-INFO: Application startup complete.
-INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
-[llamastack] | Resolved 12 providers
-[llamastack] | inner-inference => ollama0
-[llamastack] | models => __routing_table__
-[llamastack] | inference => __autorouted__
-```
-To kill the server
-```
-docker compose down
-```
-:::
-:::{tab-item} fireworks
-```
-$ cd llama-stack/distributions/fireworks && docker compose up
-```
-Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks URL server endpoint. E.g.
-```
-inference:
-  - provider_id: fireworks
-    provider_type: remote::fireworks
-    config:
-      url: https://api.fireworks.ai/inference
-      api_key: <optional api key>
-```
-:::
-:::{tab-item} together
-```
-$ cd distributions/together && docker compose up
-```
-Make sure your `run.yaml` file has the inference provider pointing to the correct Together URL server endpoint. E.g.
-```
-inference:
-  - provider_id: together
-    provider_type: remote::together
-    config:
-      url: https://api.together.xyz/v1
-      api_key: <optional api key>
-```
-:::
-::::
-**(Option 2) Via Conda**
-::::{tab-set}
-:::{tab-item} meta-reference-gpu
-1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html)
-2. Build the `meta-reference-gpu` distribution
-```
-$ llama stack build --template meta-reference-gpu --image-type conda
-```
-3. Start running distribution
-```
-$ llama stack run ~/.llama/distributions/llamastack-meta-reference-gpu/meta-reference-gpu-run.yaml
-```
-Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section:
-```
-memory:
-  - provider_id: faiss-0
-    provider_type: faiss
-    config:
-      kvstore:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/faiss_store.db
-```
-:::
-:::{tab-item} tgi
-1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html)
-2. Build the `tgi` distribution
-```bash
-llama stack build --template tgi --image-type conda
-```
-3. Start a TGI server endpoint
-4. Make sure in your `run.yaml` file, your `conda_env` is pointing to the conda environment and inference provider is pointing to the correct TGI server endpoint. E.g.
-```
-conda_env: llamastack-tgi
-...
-inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url: http://127.0.0.1:5009
-```
-5. Start Llama Stack server
-```bash
-$ llama stack run ~/.llama/distributions/llamastack-tgi/tgi-run.yaml
-```
-Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section:
-```
-memory:
-  - provider_id: faiss-0
-    provider_type: faiss
-    config:
-      kvstore:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/faiss_store.db
-```
 :::
 :::{tab-item} ollama
+- [Start Ollama Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html)
-If you wish to separately spin up a Ollama server, and connect with Llama Stack, you may use the following commands.
-#### Start Ollama server.
-- Please check the [Ollama Documentations](https://github.com/ollama/ollama) for more details.
-**Via Docker**
-```
-docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
-```
-**Via CLI**
-```
-ollama run <model_id>
-```
-#### Start Llama Stack server pointing to Ollama server
-Make sure your `run.yaml` file has the inference provider pointing to the correct Ollama endpoint. E.g.
-```
-conda_env: llamastack-ollama
-...
-inference:
-  - provider_id: ollama0
-    provider_type: remote::ollama
-    config:
-      url: http://127.0.0.1:11434
-```
-```
-llama stack build --template ollama --image-type conda
-llama stack run ~/.llama/distributions/llamastack-ollama/ollama-run.yaml
-```
-Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section:
-```
-memory:
-  - provider_id: faiss-0
-    provider_type: faiss
-    config:
-      kvstore:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/faiss_store.db
-```
-:::
-:::{tab-item} fireworks
-```bash
-llama stack build --template fireworks --image-type conda
-# -- modify run.yaml to a valid Fireworks server endpoint
-llama stack run ./run.yaml
-```
-Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks URL server endpoint. E.g.
-```
-conda_env: llamastack-fireworks
-...
-inference:
-  - provider_id: fireworks
-    provider_type: remote::fireworks
-    config:
-      url: https://api.fireworks.ai/inference
-      api_key: <optional api key>
-```
 :::
 :::{tab-item} together
+- [Start Together Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html)
-```bash
-llama stack build --template together --image-type conda
-# -- modify run.yaml to a valid Together server endpoint
-llama stack run ~/.llama/distributions/llamastack-together/together-run.yaml
-```
-Make sure your `run.yaml` file has the inference provider pointing to the correct Together URL server endpoint. E.g.
-```
-conda_env: llamastack-together
-...
-inference:
-  - provider_id: together
-    provider_type: remote::together
-    config:
-      url: https://api.together.xyz/v1
-      api_key: <optional api key>
-```
-:::
-::::
-##### 1.2 (Optional) Update Model Serving Configuration
-::::{tab-set}
-:::{tab-item} meta-reference-gpu
-You may change the `config.model` in `run.yaml` to update the model currently being served by the distribution. Make sure you have the model checkpoint downloaded in your `~/.llama`.
-```
-inference:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      model: Llama3.2-11B-Vision-Instruct
-      quantization: null
-      torch_seed: null
-      max_seq_len: 4096
-      max_batch_size: 1
-```
-Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
-:::
-:::{tab-item} tgi
-To serve a new model with `tgi`, change the docker command flag `--model-id <model-to-serve>`.
-This can be done by edit the `command` args in `compose.yaml`. E.g. Replace "Llama-3.2-1B-Instruct" with the model you want to serve.
-```
-command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.2-1B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
-```
-or by changing the docker run command's `--model-id` flag
-```
-docker run --rm -it -v $HOME/.cache/huggingface:/data -p 5009:5009 --gpus all ghcr.io/huggingface/text-generation-inference:latest --dtype bfloat16 --usage-stats on --sharded false --model-id meta-llama/Llama-3.2-1B-Instruct --port 5009
-```
-Make sure your `run.yaml` file has the inference provider pointing to the TGI server endpoint serving your model.
-```
-inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url: http://127.0.0.1:5009
-```
-```
-Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
-:::
-:::{tab-item} ollama
-You can use ollama for managing model downloads.
-```
-ollama pull llama3.1:8b-instruct-fp16
-ollama pull llama3.1:70b-instruct-fp16
-```
-> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models.
-To serve a new model with `ollama`
-```
-ollama run <model_name>
-```
-To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama.
-```
-$ ollama ps
-NAME ID SIZE PROCESSOR UNTIL
-llama3.1:8b-instruct-fp16 4aacac419454 17 GB 100% GPU 4 minutes from now
-```
-To verify that the model served by ollama is correctly connected to Llama Stack server
-```
-$ llama-stack-client models list
-+----------------------+----------------------+---------------+-----------------------------------------------+
-| identifier | llama_model | provider_id | metadata |
-+======================+======================+===============+===============================================+
-| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | ollama0 | {'ollama_model': 'llama3.1:8b-instruct-fp16'} |
-+----------------------+----------------------+---------------+-----------------------------------------------+
-```
-:::
-:::{tab-item} together
-Use `llama-stack-client models list` to check the available models served by together.
-```
-$ llama-stack-client models list
-+------------------------------+------------------------------+---------------+------------+
-| identifier | llama_model | provider_id | metadata |
-+==============================+==============================+===============+============+
-| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | together0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | together0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | together0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | together0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-```
 :::
 :::{tab-item} fireworks
-Use `llama-stack-client models list` to check the available models served by Fireworks.
+- [Start Fireworks Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html)
-```
-$ llama-stack-client models list
-+------------------------------+------------------------------+---------------+------------+
-| identifier | llama_model | provider_id | metadata |
-+==============================+==============================+===============+============+
-| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | fireworks0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | fireworks0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | fireworks0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-1B-Instruct | Llama3.2-1B-Instruct | fireworks0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | fireworks0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | fireworks0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | fireworks0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-```
 :::
 ::::
 ##### Troubleshooting
 - If you encounter any issues, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file an new issue.
 - Use `--port <PORT>` flag to use a different port number. For docker run, update the `-p <PORT>:<PORT>` flag.
@@ -535,10 +147,10 @@ $ llama-stack-client models list
 Once the server is set up, we can test it with a client to verify it's working correctly. The following command will send a chat completion request to the server's `/inference/chat_completion` API:
 ```bash
-$ curl http://localhost:5000/inference/chat_completion \
+$ curl http://localhost:5000/alpha/inference/chat-completion \
 -H "Content-Type: application/json" \
 -d '{
-    "model_id": "Llama3.1-8B-Instruct",
+    "model_id": "meta-llama/Llama-3.1-8B-Instruct",
     "messages": [
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "Write me a 2 sentence poem about the moon"}

View file

@@ -22,14 +22,22 @@ If you're looking for more specific topics like tool calling or agent setup, we
 - Download and unzip `Ollama-darwin.zip`.
 - Run the `Ollama` application.
-2. **Download the Ollama CLI**:
+1. **Download the Ollama CLI**:
 - Ensure you have the `ollama` command line tool by downloading and installing it from the same website.
-3. **Verify Installation**:
+1. **Start ollama server**:
+- Open the terminal and run:
+```
+ollama serve
+```
+1. **Run the model**:
 - Open the terminal and run:
 ```bash
-ollama run llama3.2:1b
+ollama run llama3.2:3b-instruct-fp16
 ```
+**Note**: The supported models for llama stack for now is listed in [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L43)
 ---
@@ -84,6 +92,8 @@ If you're looking for more specific topics like tool calling or agent setup, we
 ```bash
 llama stack run /path/to/your/distro/llamastack-ollama/ollama-run.yaml --port 5050
 ```
+Note:
+1. Everytime you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model
 The server will start and listen on `http://localhost:5050`.
@@ -97,7 +107,7 @@ After setting up the server, open a new terminal window and verify it's working
 curl http://localhost:5050/inference/chat_completion \
 -H "Content-Type: application/json" \
 -d '{
-    "model": "llama3.2:1b",
+    "model": "Llama3.2-3B-Instruct",
     "messages": [
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "Write me a 2-sentence poem about the moon"}
@@ -106,6 +116,8 @@ curl http://localhost:5050/inference/chat_completion \
 }'
 ```
+You can check the available models with the command `llama-stack-client models list`.
 **Expected Output:**
 ```json
 {

View file

@@ -54,6 +54,7 @@ class ToolDefinitionCommon(BaseModel):
 class SearchEngineType(Enum):
     bing = "bing"
     brave = "brave"
+    tavily = "tavily"
 @json_schema_type

View file

@@ -380,6 +380,7 @@ def _hf_download(
 def _meta_download(
     model: "Model",
+    model_id: str,
     meta_url: str,
     info: "LlamaDownloadInfo",
     max_concurrent_downloads: int,
@@ -405,8 +406,15 @@ def _meta_download(
     downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
     asyncio.run(downloader.download_all(tasks))
-    print(f"\nSuccessfully downloaded model to {output_dir}")
-    cprint(f"\nMD5 Checksums are at: {output_dir / 'checklist.chk'}", "white")
+    cprint(f"\nSuccessfully downloaded model to {output_dir}", "green")
+    cprint(
+        f"\nView MD5 checksum files at: {output_dir / 'checklist.chk'}",
+        "white",
+    )
+    cprint(
+        f"\n[Optionally] To run MD5 checksums, use the following command: llama model verify-download --model-id {model_id}",
+        "yellow",
+    )
 class ModelEntry(BaseModel):
@@ -512,7 +520,7 @@ def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
     )
     if "llamameta.net" not in meta_url:
         parser.error("Invalid Meta URL provided")
-    _meta_download(model, meta_url, info, args.max_parallel)
+    _meta_download(model, model_id, meta_url, info, args.max_parallel)
 except Exception as e:
     parser.error(f"Download failed: {str(e)}")

View file

@@ -9,6 +9,7 @@
 LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
 LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
 TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
+BUILD_PLATFORM=${BUILD_PLATFORM:-}
 if [ "$#" -lt 4 ]; then
   echo "Usage: $0 <build_name> <docker_base> <pip_dependencies> [<special_pip_deps>]" >&2
@@ -96,7 +97,7 @@ else
   add_to_docker "RUN pip install fastapi libcst"
   add_to_docker <<EOF
 RUN pip install --no-cache --extra-index-url https://test.pypi.org/simple/ \
-  llama-models==$TEST_PYPI_VERSION llama-stack==$TEST_PYPI_VERSION
+  llama-models==$TEST_PYPI_VERSION llama-stack-client==$TEST_PYPI_VERSION llama-stack==$TEST_PYPI_VERSION
 EOF
 else
   add_to_docker "RUN pip install --no-cache llama-stack"
@@ -116,7 +117,6 @@ RUN pip install --no-cache $models_mount
 EOF
 fi
 add_to_docker <<EOF
-# This would be good in production but for debugging flexibility lets not add it right now
@@ -158,7 +158,9 @@ image_tag="$image_name:$version_tag"
 # Detect platform architecture
 ARCH=$(uname -m)
-if [ "$ARCH" = "arm64" ] || [ "$ARCH" = "aarch64" ]; then
+if [ -n "$BUILD_PLATFORM" ]; then
+  PLATFORM="--platform $BUILD_PLATFORM"
+elif [ "$ARCH" = "arm64" ] || [ "$ARCH" = "aarch64" ]; then
   PLATFORM="--platform linux/arm64"
 elif [ "$ARCH" = "x86_64" ]; then
   PLATFORM="--platform linux/amd64"

View file

@@ -15,6 +15,8 @@ import httpx
 from pydantic import BaseModel, parse_obj_as
 from termcolor import cprint
+from llama_stack.apis.version import LLAMA_STACK_API_VERSION
 from llama_stack.providers.datatypes import RemoteProviderConfig
 _CLIENT_CLASSES = {}
@@ -117,7 +119,7 @@ def create_api_client_class(protocol) -> Type:
             break
         kwargs[param.name] = args[i]
-    url = f"{self.base_url}{webmethod.route}"
+    url = f"{self.base_url}/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}"
     def convert(value):
         if isinstance(value, list):
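The route change in isolation — a minimal sketch assuming `LLAMA_STACK_API_VERSION` is the string `"alpha"` (consistent with the `/alpha/...` curl examples elsewhere in this commit):
```python
LLAMA_STACK_API_VERSION = "alpha"  # assumed value for illustration

def versioned_url(base_url: str, route: str) -> str:
    # Before: f"{base_url}{webmethod.route}"
    # After: the API version sits between host and route.
    return f"{base_url}/{LLAMA_STACK_API_VERSION}/{route.lstrip('/')}"

assert (
    versioned_url("http://localhost:5000", "/inference/chat-completion")
    == "http://localhost:5000/alpha/inference/chat-completion"
)
```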

View file

@@ -9,6 +9,8 @@ from typing import Dict, List
 from pydantic import BaseModel
+from llama_stack.apis.version import LLAMA_STACK_API_VERSION
 from llama_stack.distribution.resolver import api_protocol_map
 from llama_stack.providers.datatypes import Api
@@ -33,7 +35,7 @@ def get_all_api_endpoints() -> Dict[Api, List[ApiEndpoint]]:
         continue
     webmethod = method.__webmethod__
-    route = webmethod.route
+    route = f"/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}"
     if webmethod.method == "GET":
         method = "get"

View file

@@ -86,10 +86,13 @@ class PhotogenTool(SingleMessageBuiltinTool):
 class SearchTool(SingleMessageBuiltinTool):
     def __init__(self, engine: SearchEngineType, api_key: str, **kwargs) -> None:
         self.api_key = api_key
+        self.engine_type = engine
         if engine == SearchEngineType.bing:
             self.engine = BingSearch(api_key, **kwargs)
         elif engine == SearchEngineType.brave:
             self.engine = BraveSearch(api_key, **kwargs)
+        elif engine == SearchEngineType.tavily:
+            self.engine = TavilySearch(api_key, **kwargs)
         else:
             raise ValueError(f"Unknown search engine: {engine}")
@@ -257,6 +260,21 @@ class BraveSearch:
         return {"query": query, "top_k": clean_response}
+class TavilySearch:
+    def __init__(self, api_key: str) -> None:
+        self.api_key = api_key
+    async def search(self, query: str) -> str:
+        response = requests.post(
+            "https://api.tavily.com/search",
+            json={"api_key": self.api_key, "query": query},
+        )
+        return json.dumps(self._clean_tavily_response(response.json()))
+    def _clean_tavily_response(self, search_response, top_k=3):
+        return {"query": search_response["query"], "top_k": search_response["results"]}
 class WolframAlphaTool(SingleMessageBuiltinTool):
     def __init__(self, api_key: str) -> None:
         self.api_key = api_key
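For reference, the new `TavilySearch` helper can be exercised on its own; this snippet restates the class exactly as added above so it runs without knowing its module path, and assumes a `TAVILY_SEARCH_API_KEY` environment variable:
```python
import asyncio
import json
import os

import requests

# Restatement of the TavilySearch class from the hunk above.
class TavilySearch:
    def __init__(self, api_key: str) -> None:
        self.api_key = api_key

    async def search(self, query: str) -> str:
        response = requests.post(
            "https://api.tavily.com/search",
            json={"api_key": self.api_key, "query": query},
        )
        return json.dumps(self._clean_tavily_response(response.json()))

    def _clean_tavily_response(self, search_response, top_k=3):
        return {"query": search_response["query"], "top_k": search_response["results"]}

if __name__ == "__main__":
    tool = TavilySearch(api_key=os.environ["TAVILY_SEARCH_API_KEY"])
    print(asyncio.run(tool.search("What is Llama Stack?")))
```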

View file

@@ -50,11 +50,11 @@ MODEL_ALIASES = [
     ),
     build_model_alias(
         "fireworks/llama-v3p2-1b-instruct",
-        CoreModelId.llama3_2_3b_instruct.value,
+        CoreModelId.llama3_2_1b_instruct.value,
     ),
     build_model_alias(
         "fireworks/llama-v3p2-3b-instruct",
-        CoreModelId.llama3_2_11b_vision_instruct.value,
+        CoreModelId.llama3_2_3b_instruct.value,
     ),
     build_model_alias(
         "fireworks/llama-v3p2-11b-vision-instruct",
@@ -214,10 +214,10 @@ class FireworksInferenceAdapter(
         async def _to_async_generator():
             if "messages" in params:
-                stream = await self._get_client().chat.completions.acreate(**params)
+                stream = self._get_client().chat.completions.acreate(**params)
             else:
-                stream = self._get_client().completion.create(**params)
+                stream = self._get_client().completion.acreate(**params)
-            for chunk in stream:
+            async for chunk in stream:
                 yield chunk
         stream = _to_async_generator()
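The streaming fix above follows a general asyncio pattern: an `acreate`-style call returns an async stream that should not be awaited eagerly, and must be consumed with `async for`. A self-contained sketch of that pattern (toy stand-ins, not the Fireworks client):
```python
import asyncio
from typing import AsyncIterator

async def fake_acreate(n: int) -> AsyncIterator[int]:
    # Stand-in for an acreate()-style streaming call.
    for chunk in range(n):
        await asyncio.sleep(0)  # simulate network latency
        yield chunk

async def to_async_generator():
    stream = fake_acreate(3)  # note: no `await` here
    async for chunk in stream:  # `async for`, not `for`
        yield chunk

async def main() -> None:
    async for chunk in to_async_generator():
        print(chunk)

asyncio.run(main())
```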

View file

@@ -264,6 +264,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
 class TGIAdapter(_HfAdapter):
     async def initialize(self, config: TGIImplConfig) -> None:
+        print(f"Initializing TGI client with url={config.url}")
         self.client = AsyncInferenceClient(model=config.url, token=config.api_token)
         endpoint_info = await self.client.get_endpoint_info()
         self.max_tokens = endpoint_info["max_total_tokens"]

View file

@@ -53,6 +53,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         self.client = None
     async def initialize(self) -> None:
+        print(f"Initializing VLLM client with base_url={self.config.url}")
         self.client = OpenAI(base_url=self.config.url, api_key=self.config.api_token)
     async def shutdown(self) -> None:

View file

@@ -68,6 +68,73 @@ def query_attachment_messages():
     ]
+async def create_agent_turn_with_search_tool(
+    agents_stack: Dict[str, object],
+    search_query_messages: List[object],
+    common_params: Dict[str, str],
+    search_tool_definition: SearchToolDefinition,
+) -> None:
+    """
+    Create an agent turn with a search tool.
+
+    Args:
+        agents_stack (Dict[str, object]): The agents stack.
+        search_query_messages (List[object]): The search query messages.
+        common_params (Dict[str, str]): The common parameters.
+        search_tool_definition (SearchToolDefinition): The search tool definition.
+    """
+    # Create an agent with the search tool
+    agent_config = AgentConfig(
+        **{
+            **common_params,
+            "tools": [search_tool_definition],
+        }
+    )
+    agent_id, session_id = await create_agent_session(
+        agents_stack.impls[Api.agents], agent_config
+    )
+    turn_request = dict(
+        agent_id=agent_id,
+        session_id=session_id,
+        messages=search_query_messages,
+        stream=True,
+    )
+    turn_response = [
+        chunk
+        async for chunk in await agents_stack.impls[Api.agents].create_agent_turn(
+            **turn_request
+        )
+    ]
+    assert len(turn_response) > 0
+    assert all(
+        isinstance(chunk, AgentTurnResponseStreamChunk) for chunk in turn_response
+    )
+    check_event_types(turn_response)
+    # Check for tool execution events
+    tool_execution_events = [
+        chunk
+        for chunk in turn_response
+        if isinstance(chunk.event.payload, AgentTurnResponseStepCompletePayload)
+        and chunk.event.payload.step_details.step_type == StepType.tool_execution.value
+    ]
+    assert len(tool_execution_events) > 0, "No tool execution events found"
+    # Check the tool execution details
+    tool_execution = tool_execution_events[0].event.payload.step_details
+    assert isinstance(tool_execution, ToolExecutionStep)
+    assert len(tool_execution.tool_calls) > 0
+    assert tool_execution.tool_calls[0].tool_name == BuiltinTool.brave_search
+    assert len(tool_execution.tool_responses) > 0
+    check_turn_complete_event(turn_response, session_id, search_query_messages)
 class TestAgents:
     @pytest.mark.asyncio
     async def test_agent_turns_with_safety(
@@ -215,63 +282,34 @@ class TestAgents:
     async def test_create_agent_turn_with_brave_search(
         self, agents_stack, search_query_messages, common_params
     ):
-        agents_impl = agents_stack.impls[Api.agents]
         if "BRAVE_SEARCH_API_KEY" not in os.environ:
             pytest.skip("BRAVE_SEARCH_API_KEY not set, skipping test")
-        # Create an agent with Brave search tool
-        agent_config = AgentConfig(
-            **{
-                **common_params,
-                "tools": [
-                    SearchToolDefinition(
-                        type=AgentTool.brave_search.value,
-                        api_key=os.environ["BRAVE_SEARCH_API_KEY"],
-                        engine=SearchEngineType.brave,
-                    )
-                ],
-            }
+        search_tool_definition = SearchToolDefinition(
+            type=AgentTool.brave_search.value,
+            api_key=os.environ["BRAVE_SEARCH_API_KEY"],
+            engine=SearchEngineType.brave,
+        )
+        await create_agent_turn_with_search_tool(
+            agents_stack, search_query_messages, common_params, search_tool_definition
         )
-        agent_id, session_id = await create_agent_session(agents_impl, agent_config)
-        turn_request = dict(
-            agent_id=agent_id,
-            session_id=session_id,
-            messages=search_query_messages,
-            stream=True,
-        )
-        turn_response = [
-            chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
-        ]
-        assert len(turn_response) > 0
-        assert all(
-            isinstance(chunk, AgentTurnResponseStreamChunk) for chunk in turn_response
-        )
-        check_event_types(turn_response)
-        # Check for tool execution events
-        tool_execution_events = [
-            chunk
-            for chunk in turn_response
-            if isinstance(chunk.event.payload, AgentTurnResponseStepCompletePayload)
-            and chunk.event.payload.step_details.step_type
-            == StepType.tool_execution.value
-        ]
-        assert len(tool_execution_events) > 0, "No tool execution events found"
-        # Check the tool execution details
-        tool_execution = tool_execution_events[0].event.payload.step_details
-        assert isinstance(tool_execution, ToolExecutionStep)
-        assert len(tool_execution.tool_calls) > 0
-        assert tool_execution.tool_calls[0].tool_name == BuiltinTool.brave_search
-        assert len(tool_execution.tool_responses) > 0
-        check_turn_complete_event(turn_response, session_id, search_query_messages)
+    @pytest.mark.asyncio
+    async def test_create_agent_turn_with_tavily_search(
+        self, agents_stack, search_query_messages, common_params
+    ):
+        if "TAVILY_SEARCH_API_KEY" not in os.environ:
+            pytest.skip("TAVILY_SEARCH_API_KEY not set, skipping test")
+        search_tool_definition = SearchToolDefinition(
+            type=AgentTool.brave_search.value,  # place holder only
+            api_key=os.environ["TAVILY_SEARCH_API_KEY"],
+            engine=SearchEngineType.tavily,
+        )
+        await create_agent_turn_with_search_tool(
+            agents_stack, search_query_messages, common_params, search_tool_definition
+        )
 def check_event_types(turn_response):
     event_types = [chunk.event.payload.event_type for chunk in turn_response]

View file

@@ -25,7 +25,11 @@ from .utils import group_chunks
 def get_expected_stop_reason(model: str):
-    return StopReason.end_of_message if "Llama3.1" in model else StopReason.end_of_turn
+    return (
+        StopReason.end_of_message
+        if ("Llama3.1" in model or "Llama-3.1" in model)
+        else StopReason.end_of_turn
+    )
 @pytest.fixture
@@ -34,7 +38,7 @@ def common_params(inference_model):
     "tool_choice": ToolChoice.auto,
     "tool_prompt_format": (
         ToolPromptFormat.json
-        if "Llama3.1" in inference_model
+        if ("Llama3.1" in inference_model or "Llama-3.1" in inference_model)
         else ToolPromptFormat.python_list
     ),
 }
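The broadened checks accept both model-name spellings; distilled into a tiny helper (illustrative only):
```python
def is_llama_3_1(model: str) -> bool:
    # Matches both "Llama3.1-8B-Instruct" and "meta-llama/Llama-3.1-8B-Instruct".
    return "Llama3.1" in model or "Llama-3.1" in model

assert is_llama_3_1("Llama3.1-8B-Instruct")
assert is_llama_3_1("meta-llama/Llama-3.1-8B-Instruct")
assert not is_llama_3_1("meta-llama/Llama-3.2-3B-Instruct")
```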

View file

@@ -6,6 +6,7 @@
 import concurrent.futures
 import importlib
+import json
 import subprocess
 import sys
 from functools import partial
@@ -14,6 +15,11 @@ from typing import Iterator
 from rich.progress import Progress, SpinnerColumn, TextColumn
+from llama_stack.distribution.build import (
+    get_provider_dependencies,
+    SERVER_DEPENDENCIES,
+)
 REPO_ROOT = Path(__file__).parent.parent.parent
@@ -67,6 +73,39 @@ def check_for_changes() -> bool:
     return result.returncode != 0
+def collect_template_dependencies(template_dir: Path) -> tuple[str, list[str]]:
+    try:
+        module_name = f"llama_stack.templates.{template_dir.name}"
+        module = importlib.import_module(module_name)
+        if template_func := getattr(module, "get_distribution_template", None):
+            template = template_func()
+            normal_deps, special_deps = get_provider_dependencies(template.providers)
+            # Combine all dependencies in order: normal deps, special deps, server deps
+            all_deps = sorted(list(set(normal_deps + SERVER_DEPENDENCIES))) + sorted(
+                list(set(special_deps))
+            )
+            return template.name, all_deps
+    except Exception:
+        return None, []
+    return None, []
+def generate_dependencies_file():
+    templates_dir = REPO_ROOT / "llama_stack" / "templates"
+    distribution_deps = {}
+    for template_dir in find_template_dirs(templates_dir):
+        name, deps = collect_template_dependencies(template_dir)
+        if name:
+            distribution_deps[name] = deps
+    deps_file = REPO_ROOT / "distributions" / "dependencies.json"
+    with open(deps_file, "w") as f:
+        json.dump(distribution_deps, f, indent=2)
 def main():
     templates_dir = REPO_ROOT / "llama_stack" / "templates"
@@ -88,6 +127,8 @@ def main():
     list(executor.map(process_func, template_dirs))
     progress.update(task, advance=len(template_dirs))
+    generate_dependencies_file()
 if check_for_changes():
     print(
         "Distribution template changes detected. Please commit the changes.",

View file

@@ -57,11 +57,11 @@ models:
   provider_id: null
   provider_model_id: fireworks/llama-v3p1-405b-instruct
 - metadata: {}
-  model_id: meta-llama/Llama-3.2-3B-Instruct
+  model_id: meta-llama/Llama-3.2-1B-Instruct
   provider_id: null
   provider_model_id: fireworks/llama-v3p2-1b-instruct
 - metadata: {}
-  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
+  model_id: meta-llama/Llama-3.2-3B-Instruct
   provider_id: null
   provider_model_id: fireworks/llama-v3p2-3b-instruct
 - metadata: {}

View file

@@ -2,7 +2,7 @@ version: '2'
 name: tgi
 distribution_spec:
   description: Use (an external) TGI server for running LLM inference
-  docker_image: llamastack/distribution-tgi:test-0.0.52rc3
+  docker_image: null
 providers:
   inference:
   - remote::tgi

View file

@@ -1,6 +1,6 @@
 version: '2'
 image_name: tgi
-docker_image: llamastack/distribution-tgi:test-0.0.52rc3
+docker_image: null
 conda_env: tgi
 apis:
 - agents

View file

@@ -1,6 +1,6 @@
 version: '2'
 image_name: tgi
-docker_image: llamastack/distribution-tgi:test-0.0.52rc3
+docker_image: null
 conda_env: tgi
 apis:
 - agents

View file

@@ -41,7 +41,7 @@ def get_distribution_template() -> DistributionTemplate:
         name="tgi",
         distro_type="self_hosted",
         description="Use (an external) TGI server for running LLM inference",
-        docker_image="llamastack/distribution-tgi:test-0.0.52rc3",
+        docker_image=None,
         template_path=Path(__file__).parent / "doc_template.md",
         providers=providers,
         default_models=[inference_model, safety_model],

View file

@@ -2,7 +2,8 @@ blobfile
 fire
 httpx
 huggingface-hub
-llama-models>=0.0.50
+llama-models>=0.0.53
+llama-stack-client>=0.0.53
 prompt-toolkit
 python-dotenv
 pydantic>=2

View file

@@ -16,7 +16,7 @@ def read_requirements():
 setup(
     name="llama_stack",
-    version="0.0.50",
+    version="0.0.53",
     author="Meta Llama",
     author_email="llama-oss@meta.com",
     description="Llama Stack",