diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 000000000..b081678c4
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,35 @@
+# Changelog
+
+## 0.0.53
+
+### Added
+- Resource-oriented design for models, shields, memory banks, datasets, and eval tasks
+- Persistence for registered objects with distribution
+- Ability to persist memory banks created for FAISS
+- PostgreSQL KVStore implementation
+- Environment variable placeholder support in run.yaml files
+- Comprehensive Zero-to-Hero notebooks and quickstart guides
+- Support for quantized models in Ollama
+- Vision model support for Together, Fireworks, Meta-Reference, Ollama, and vLLM
+- Bedrock distribution with safety shields support
+- Evals API with task registration and scoring functions
+- MMLU and SimpleQA benchmark scoring functions
+- Hugging Face dataset provider integration for benchmarks
+- Support for custom dataset registration from local paths
+- Benchmark evaluation CLI tools with visualization tables
+- RAG evaluation scoring functions and metrics
+- Local persistence for datasets and eval tasks
+
+### Changed
+- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
+- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
+- Updated API signatures for dataset and eval task registration
+- Restructured folder organization for providers
+- Enhanced Docker build configuration
+- Added version prefixing for REST API routes
+- Enhanced evaluation task registration workflow
+- Improved benchmark evaluation output formatting
+- Restructured evals folder organization for better modularity
+
+### Removed
+- `llama stack configure` command
diff --git a/MANIFEST.in b/MANIFEST.in
index 27cb775f7..4d1843051 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,5 @@
 include requirements.txt
+include distributions/dependencies.json
 include llama_stack/distribution/*.sh
 include llama_stack/cli/scripts/*.sh
 include llama_stack/templates/*/*.yaml
diff --git a/distributions/dependencies.json b/distributions/dependencies.json
new file mode 100644
index 000000000..92ebd1105
--- /dev/null
+++ b/distributions/dependencies.json
@@ -0,0 +1,171 @@
+{
+  "together": [
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "together",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "remote-vllm": [
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "openai",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "fireworks": [
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "fireworks-ai",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "tgi": [
+    "aiohttp",
+    "aiosqlite",
+    
"blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "huggingface_hub", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "meta-reference-gpu": [ + "accelerate", + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "fairscale", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "lm-format-enforcer", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "torch", + "torchvision", + "tqdm", + "transformers", + "uvicorn", + "zmq", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "ollama": [ + "aiohttp", + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "ollama", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ] +} diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index 838633a4f..cf4bf5125 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "[DRAFT] Llama Stack Specification", "version": "alpha", - "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-11-18 23:37:24.867143" + "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-11-19 09:14:01.145131" }, "servers": [ { diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 994e3aac4..e84f11bdd 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -3400,7 +3400,7 @@ info: description: "This is the specification of the llama stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ \ to\n best leverage Llama Models. 
The specification is still in\ - \ draft and subject to change.\n Generated at 2024-11-18 23:37:24.867143" + \ draft and subject to change.\n Generated at 2024-11-19 09:14:01.145131" title: '[DRAFT] Llama Stack Specification' version: alpha jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema diff --git a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md b/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md index f940e6de2..cca1155e1 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md @@ -25,8 +25,8 @@ The following models are available by default: - `meta-llama/Llama-3.1-8B-Instruct (fireworks/llama-v3p1-8b-instruct)` - `meta-llama/Llama-3.1-70B-Instruct (fireworks/llama-v3p1-70b-instruct)` - `meta-llama/Llama-3.1-405B-Instruct-FP8 (fireworks/llama-v3p1-405b-instruct)` -- `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-1b-instruct)` -- `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-3b-instruct)` +- `meta-llama/Llama-3.2-1B-Instruct (fireworks/llama-v3p2-1b-instruct)` +- `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-3b-instruct)` - `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-11b-vision-instruct)` - `meta-llama/Llama-3.2-90B-Vision-Instruct (fireworks/llama-v3p2-90b-vision-instruct)` - `meta-llama/Llama-Guard-3-8B (fireworks/llama-guard-3-8b)` diff --git a/docs/source/getting_started/distributions/self_hosted_distro/index.md b/docs/source/getting_started/distributions/self_hosted_distro/index.md index ed6ab5d7f..502b95cb4 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/index.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/index.md @@ -23,5 +23,6 @@ tgi dell-tgi together fireworks +remote-vllm bedrock ``` diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index eb95db7cc..5fc2c5ed8 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -53,9 +53,9 @@ Please see our pages in detail for the types of distributions we offer: 3. [On-device Distribution](./distributions/ondevice_distro/index.md): If you want to run Llama Stack inference on your iOS / Android device. -### Quick Start Commands +### Table of Contents -Once you have decided on the inference provider and distribution to use, use the following quick start commands to get started. +Once you have decided on the inference provider and distribution to use, use the following guides to get started. ##### 1.0 Prerequisite @@ -109,421 +109,33 @@ Access to Single-Node CPU with Fireworks hosted endpoint via API_KEY from [firew ##### 1.1. Start the distribution -**(Option 1) Via Docker** ::::{tab-set} - :::{tab-item} meta-reference-gpu -``` -$ cd llama-stack/distributions/meta-reference-gpu && docker compose up -``` - -This will download and start running a pre-built Docker container. 
Alternatively, you may use the following commands: - -``` -docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-gpu --yaml_config /root/my-run.yaml -``` +- [Start Meta Reference GPU Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) ::: :::{tab-item} vLLM -``` -$ cd llama-stack/distributions/remote-vllm && docker compose up -``` - -The script will first start up vLLM server on port 8000, then start up Llama Stack distribution server hooking up to it for inference. You should see the following outputs -- -``` - -``` - -To kill the server -``` -docker compose down -``` +- [Start vLLM Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/remote-vllm.html) ::: :::{tab-item} tgi -``` -$ cd llama-stack/distributions/tgi && docker compose up -``` - -The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference. You should see the following outputs -- -``` -[text-generation-inference] | 2024-10-15T18:56:33.810397Z INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama) -[text-generation-inference] | 2024-10-15T18:56:33.810448Z WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0 -[text-generation-inference] | 2024-10-15T18:56:33.864143Z INFO text_generation_router::server: router/src/server.rs:2353: Connected -INFO: Started server process [1] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) -``` - -To kill the server -``` -docker compose down -``` -::: - - -:::{tab-item} ollama -``` -$ cd llama-stack/distributions/ollama && docker compose up - -# OR - -$ cd llama-stack/distributions/ollama-gpu && docker compose up -``` - -You will see outputs similar to following --- -``` -[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps" -[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps" -INFO: Started server process [1] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) -[llamastack] | Resolved 12 providers -[llamastack] | inner-inference => ollama0 -[llamastack] | models => __routing_table__ -[llamastack] | inference => __autorouted__ -``` - -To kill the server -``` -docker compose down -``` -::: - -:::{tab-item} fireworks -``` -$ cd llama-stack/distributions/fireworks && docker compose up -``` - -Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks URL server endpoint. E.g. -``` -inference: - - provider_id: fireworks - provider_type: remote::fireworks - config: - url: https://api.fireworks.ai/inference - api_key: -``` -::: - -:::{tab-item} together -``` -$ cd distributions/together && docker compose up -``` - -Make sure your `run.yaml` file has the inference provider pointing to the correct Together URL server endpoint. E.g. -``` -inference: - - provider_id: together - provider_type: remote::together - config: - url: https://api.together.xyz/v1 - api_key: -``` -::: - - -:::: - -**(Option 2) Via Conda** - -::::{tab-set} - -:::{tab-item} meta-reference-gpu -1. Install the `llama` CLI. 
See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html) - -2. Build the `meta-reference-gpu` distribution - -``` -$ llama stack build --template meta-reference-gpu --image-type conda -``` - -3. Start running distribution -``` -$ llama stack run ~/.llama/distributions/llamastack-meta-reference-gpu/meta-reference-gpu-run.yaml -``` - -Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section: -``` -memory: - - provider_id: faiss-0 - provider_type: faiss - config: - kvstore: - namespace: null - type: sqlite - db_path: ~/.llama/runtime/faiss_store.db -``` - -::: - -:::{tab-item} tgi -1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html) - -2. Build the `tgi` distribution - -```bash -llama stack build --template tgi --image-type conda -``` - -3. Start a TGI server endpoint - -4. Make sure in your `run.yaml` file, your `conda_env` is pointing to the conda environment and inference provider is pointing to the correct TGI server endpoint. E.g. -``` -conda_env: llamastack-tgi -... -inference: - - provider_id: tgi0 - provider_type: remote::tgi - config: - url: http://127.0.0.1:5009 -``` - -5. Start Llama Stack server -```bash -$ llama stack run ~/.llama/distributions/llamastack-tgi/tgi-run.yaml -``` - -Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section: -``` -memory: - - provider_id: faiss-0 - provider_type: faiss - config: - kvstore: - namespace: null - type: sqlite - db_path: ~/.llama/runtime/faiss_store.db -``` +- [Start TGI Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) ::: :::{tab-item} ollama - -If you wish to separately spin up a Ollama server, and connect with Llama Stack, you may use the following commands. - -#### Start Ollama server. -- Please check the [Ollama Documentations](https://github.com/ollama/ollama) for more details. - -**Via Docker** -``` -docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama -``` - -**Via CLI** -``` -ollama run -``` - -#### Start Llama Stack server pointing to Ollama server - -Make sure your `run.yaml` file has the inference provider pointing to the correct Ollama endpoint. E.g. -``` -conda_env: llamastack-ollama -... -inference: - - provider_id: ollama0 - provider_type: remote::ollama - config: - url: http://127.0.0.1:11434 -``` - -``` -llama stack build --template ollama --image-type conda -llama stack run ~/.llama/distributions/llamastack-ollama/ollama-run.yaml -``` - -Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. 
See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section: -``` -memory: - - provider_id: faiss-0 - provider_type: faiss - config: - kvstore: - namespace: null - type: sqlite - db_path: ~/.llama/runtime/faiss_store.db -``` - -::: - -:::{tab-item} fireworks - -```bash -llama stack build --template fireworks --image-type conda -# -- modify run.yaml to a valid Fireworks server endpoint -llama stack run ./run.yaml -``` - -Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks URL server endpoint. E.g. -``` -conda_env: llamastack-fireworks -... -inference: - - provider_id: fireworks - provider_type: remote::fireworks - config: - url: https://api.fireworks.ai/inference - api_key: -``` +- [Start Ollama Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) ::: :::{tab-item} together - -```bash -llama stack build --template together --image-type conda -# -- modify run.yaml to a valid Together server endpoint -llama stack run ~/.llama/distributions/llamastack-together/together-run.yaml -``` - -Make sure your `run.yaml` file has the inference provider pointing to the correct Together URL server endpoint. E.g. -``` -conda_env: llamastack-together -... -inference: - - provider_id: together - provider_type: remote::together - config: - url: https://api.together.xyz/v1 - api_key: -``` -::: - -:::: - -##### 1.2 (Optional) Update Model Serving Configuration -::::{tab-set} - -:::{tab-item} meta-reference-gpu -You may change the `config.model` in `run.yaml` to update the model currently being served by the distribution. Make sure you have the model checkpoint downloaded in your `~/.llama`. -``` -inference: - - provider_id: meta0 - provider_type: inline::meta-reference - config: - model: Llama3.2-11B-Vision-Instruct - quantization: null - torch_seed: null - max_seq_len: 4096 - max_batch_size: 1 -``` - -Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. -::: - -:::{tab-item} tgi -To serve a new model with `tgi`, change the docker command flag `--model-id `. - -This can be done by edit the `command` args in `compose.yaml`. E.g. Replace "Llama-3.2-1B-Instruct" with the model you want to serve. - -``` -command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.2-1B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"] -``` - -or by changing the docker run command's `--model-id` flag -``` -docker run --rm -it -v $HOME/.cache/huggingface:/data -p 5009:5009 --gpus all ghcr.io/huggingface/text-generation-inference:latest --dtype bfloat16 --usage-stats on --sharded false --model-id meta-llama/Llama-3.2-1B-Instruct --port 5009 -``` - -Make sure your `run.yaml` file has the inference provider pointing to the TGI server endpoint serving your model. -``` -inference: - - provider_id: tgi0 - provider_type: remote::tgi - config: - url: http://127.0.0.1:5009 -``` -``` - -Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. -::: - -:::{tab-item} ollama -You can use ollama for managing model downloads. 
- -``` -ollama pull llama3.1:8b-instruct-fp16 -ollama pull llama3.1:70b-instruct-fp16 -``` - -> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models. - - -To serve a new model with `ollama` -``` -ollama run -``` - -To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama. -``` -$ ollama ps - -NAME ID SIZE PROCESSOR UNTIL -llama3.1:8b-instruct-fp16 4aacac419454 17 GB 100% GPU 4 minutes from now -``` - -To verify that the model served by ollama is correctly connected to Llama Stack server -``` -$ llama-stack-client models list -+----------------------+----------------------+---------------+-----------------------------------------------+ -| identifier | llama_model | provider_id | metadata | -+======================+======================+===============+===============================================+ -| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | ollama0 | {'ollama_model': 'llama3.1:8b-instruct-fp16'} | -+----------------------+----------------------+---------------+-----------------------------------------------+ -``` -::: - -:::{tab-item} together -Use `llama-stack-client models list` to check the available models served by together. - -``` -$ llama-stack-client models list -+------------------------------+------------------------------+---------------+------------+ -| identifier | llama_model | provider_id | metadata | -+==============================+==============================+===============+============+ -| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -``` +- [Start Together Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html) ::: :::{tab-item} fireworks -Use `llama-stack-client models list` to check the available models served by Fireworks. 
-``` -$ llama-stack-client models list -+------------------------------+------------------------------+---------------+------------+ -| identifier | llama_model | provider_id | metadata | -+==============================+==============================+===============+============+ -| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-1B-Instruct | Llama3.2-1B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -``` +- [Start Fireworks Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html) ::: :::: - ##### Troubleshooting - If you encounter any issues, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file an new issue. - Use `--port ` flag to use a different port number. For docker run, update the `-p :` flag. @@ -535,10 +147,10 @@ $ llama-stack-client models list Once the server is set up, we can test it with a client to verify it's working correctly. The following command will send a chat completion request to the server's `/inference/chat_completion` API: ```bash -$ curl http://localhost:5000/inference/chat_completion \ +$ curl http://localhost:5000/alpha/inference/chat-completion \ -H "Content-Type: application/json" \ -d '{ - "model_id": "Llama3.1-8B-Instruct", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", "messages": [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Write me a 2 sentence poem about the moon"} diff --git a/docs/zero_to_hero_guide/quickstart.md b/docs/zero_to_hero_guide/quickstart.md index 54a01e219..df8e9abc4 100644 --- a/docs/zero_to_hero_guide/quickstart.md +++ b/docs/zero_to_hero_guide/quickstart.md @@ -22,14 +22,22 @@ If you're looking for more specific topics like tool calling or agent setup, we - Download and unzip `Ollama-darwin.zip`. - Run the `Ollama` application. -2. **Download the Ollama CLI**: +1. **Download the Ollama CLI**: - Ensure you have the `ollama` command line tool by downloading and installing it from the same website. -3. **Verify Installation**: +1. **Start ollama server**: + - Open the terminal and run: + ``` + ollama serve + ``` + +1. 
**Run the model**:
    - Open the terminal and run:
      ```bash
-     ollama run llama3.2:1b
+     ollama run llama3.2:3b-instruct-fp16
      ```
+   **Note**: The models currently supported by Llama Stack are listed [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L43).
+
 
 ---
@@ -84,6 +92,8 @@ If you're looking for more specific topics like tool calling or agent setup, we
    ```bash
    llama stack run /path/to/your/distro/llamastack-ollama/ollama-run.yaml --port 5050
    ```
+   Note:
+   1. Every time you run a new model with `ollama run`, you will need to restart Llama Stack. Otherwise it won't see the new model.
 
 The server will start and listen on `http://localhost:5050`.
 
@@ -97,7 +107,7 @@ After setting up the server, open a new terminal window and verify it's working
 curl http://localhost:5050/inference/chat_completion \
 -H "Content-Type: application/json" \
 -d '{
-   "model": "llama3.2:1b",
+   "model": "Llama3.2-3B-Instruct",
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "Write me a 2-sentence poem about the moon"}
@@ -106,6 +116,8 @@ curl http://localhost:5050/inference/chat_completion \
 }'
 ```
 
+You can check the available models with the command `llama-stack-client models list`.
+
 **Expected Output:**
 ```json
 {
diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py
index f2602ddde..25de35497 100644
--- a/llama_stack/apis/agents/agents.py
+++ b/llama_stack/apis/agents/agents.py
@@ -54,6 +54,7 @@ class ToolDefinitionCommon(BaseModel):
 class SearchEngineType(Enum):
     bing = "bing"
     brave = "brave"
+    tavily = "tavily"
 
 
 @json_schema_type
diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py
index bb57186e5..c2f8ac855 100644
--- a/llama_stack/cli/download.py
+++ b/llama_stack/cli/download.py
@@ -380,6 +380,7 @@ def _hf_download(
 
 def _meta_download(
     model: "Model",
+    model_id: str,
     meta_url: str,
     info: "LlamaDownloadInfo",
     max_concurrent_downloads: int,
@@ -405,8 +406,15 @@ def _meta_download(
     downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
     asyncio.run(downloader.download_all(tasks))
 
-    print(f"\nSuccessfully downloaded model to {output_dir}")
-    cprint(f"\nMD5 Checksums are at: {output_dir / 'checklist.chk'}", "white")
+    cprint(f"\nSuccessfully downloaded model to {output_dir}", "green")
+    cprint(
+        f"\nView MD5 checksum files at: {output_dir / 'checklist.chk'}",
+        "white",
+    )
+    cprint(
+        f"\n[Optionally] To run MD5 checksums, use the following command: llama model verify-download --model-id {model_id}",
+        "yellow",
+    )
 
 
 class ModelEntry(BaseModel):
@@ -512,7 +520,7 @@ def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
         )
         if "llamameta.net" not in meta_url:
             parser.error("Invalid Meta URL provided")
-        _meta_download(model, meta_url, info, args.max_parallel)
+        _meta_download(model, model_id, meta_url, info, args.max_parallel)
     except Exception as e:
         parser.error(f"Download failed: {str(e)}")
diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh
index 139883618..2730ae174 100755
--- a/llama_stack/distribution/build_container.sh
+++ b/llama_stack/distribution/build_container.sh
@@ -9,6 +9,7 @@
 LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
 LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
 TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
+BUILD_PLATFORM=${BUILD_PLATFORM:-}
 
 if [ "$#" -lt 4 ]; then
   echo "Usage: $0 []" >&2
@@ -96,7 +97,7 @@ else
   add_to_docker "RUN pip install fastapi 
libcst" add_to_docker < Type: break kwargs[param.name] = args[i] - url = f"{self.base_url}{webmethod.route}" + url = f"{self.base_url}/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}" def convert(value): if isinstance(value, list): diff --git a/llama_stack/distribution/server/endpoints.py b/llama_stack/distribution/server/endpoints.py index 93432abe1..af429e020 100644 --- a/llama_stack/distribution/server/endpoints.py +++ b/llama_stack/distribution/server/endpoints.py @@ -9,6 +9,8 @@ from typing import Dict, List from pydantic import BaseModel +from llama_stack.apis.version import LLAMA_STACK_API_VERSION + from llama_stack.distribution.resolver import api_protocol_map from llama_stack.providers.datatypes import Api @@ -33,7 +35,7 @@ def get_all_api_endpoints() -> Dict[Api, List[ApiEndpoint]]: continue webmethod = method.__webmethod__ - route = webmethod.route + route = f"/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}" if webmethod.method == "GET": method = "get" diff --git a/llama_stack/providers/inline/agents/meta_reference/tools/builtin.py b/llama_stack/providers/inline/agents/meta_reference/tools/builtin.py index 4c9cdfcd2..a1e7d08f5 100644 --- a/llama_stack/providers/inline/agents/meta_reference/tools/builtin.py +++ b/llama_stack/providers/inline/agents/meta_reference/tools/builtin.py @@ -86,10 +86,13 @@ class PhotogenTool(SingleMessageBuiltinTool): class SearchTool(SingleMessageBuiltinTool): def __init__(self, engine: SearchEngineType, api_key: str, **kwargs) -> None: self.api_key = api_key + self.engine_type = engine if engine == SearchEngineType.bing: self.engine = BingSearch(api_key, **kwargs) elif engine == SearchEngineType.brave: self.engine = BraveSearch(api_key, **kwargs) + elif engine == SearchEngineType.tavily: + self.engine = TavilySearch(api_key, **kwargs) else: raise ValueError(f"Unknown search engine: {engine}") @@ -257,6 +260,21 @@ class BraveSearch: return {"query": query, "top_k": clean_response} +class TavilySearch: + def __init__(self, api_key: str) -> None: + self.api_key = api_key + + async def search(self, query: str) -> str: + response = requests.post( + "https://api.tavily.com/search", + json={"api_key": self.api_key, "query": query}, + ) + return json.dumps(self._clean_tavily_response(response.json())) + + def _clean_tavily_response(self, search_response, top_k=3): + return {"query": search_response["query"], "top_k": search_response["results"]} + + class WolframAlphaTool(SingleMessageBuiltinTool): def __init__(self, api_key: str) -> None: self.api_key = api_key diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index 3ff50d378..c3e634155 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -50,11 +50,11 @@ MODEL_ALIASES = [ ), build_model_alias( "fireworks/llama-v3p2-1b-instruct", - CoreModelId.llama3_2_3b_instruct.value, + CoreModelId.llama3_2_1b_instruct.value, ), build_model_alias( "fireworks/llama-v3p2-3b-instruct", - CoreModelId.llama3_2_11b_vision_instruct.value, + CoreModelId.llama3_2_3b_instruct.value, ), build_model_alias( "fireworks/llama-v3p2-11b-vision-instruct", @@ -214,10 +214,10 @@ class FireworksInferenceAdapter( async def _to_async_generator(): if "messages" in params: - stream = await self._get_client().chat.completions.acreate(**params) + stream = self._get_client().chat.completions.acreate(**params) else: - stream = 
self._get_client().completion.create(**params) - for chunk in stream: + stream = self._get_client().completion.acreate(**params) + async for chunk in stream: yield chunk stream = _to_async_generator() diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py index 30745cb10..92492e3da 100644 --- a/llama_stack/providers/remote/inference/tgi/tgi.py +++ b/llama_stack/providers/remote/inference/tgi/tgi.py @@ -264,6 +264,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate): class TGIAdapter(_HfAdapter): async def initialize(self, config: TGIImplConfig) -> None: + print(f"Initializing TGI client with url={config.url}") self.client = AsyncInferenceClient(model=config.url, token=config.api_token) endpoint_info = await self.client.get_endpoint_info() self.max_tokens = endpoint_info["max_total_tokens"] diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 788f6cac4..3c877639c 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -53,6 +53,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): self.client = None async def initialize(self) -> None: + print(f"Initializing VLLM client with base_url={self.config.url}") self.client = OpenAI(base_url=self.config.url, api_key=self.config.api_token) async def shutdown(self) -> None: diff --git a/llama_stack/providers/tests/agents/test_agents.py b/llama_stack/providers/tests/agents/test_agents.py index 60c047058..ee2f3d29f 100644 --- a/llama_stack/providers/tests/agents/test_agents.py +++ b/llama_stack/providers/tests/agents/test_agents.py @@ -68,6 +68,73 @@ def query_attachment_messages(): ] +async def create_agent_turn_with_search_tool( + agents_stack: Dict[str, object], + search_query_messages: List[object], + common_params: Dict[str, str], + search_tool_definition: SearchToolDefinition, +) -> None: + """ + Create an agent turn with a search tool. + + Args: + agents_stack (Dict[str, object]): The agents stack. + search_query_messages (List[object]): The search query messages. + common_params (Dict[str, str]): The common parameters. + search_tool_definition (SearchToolDefinition): The search tool definition. 
+ """ + + # Create an agent with the search tool + agent_config = AgentConfig( + **{ + **common_params, + "tools": [search_tool_definition], + } + ) + + agent_id, session_id = await create_agent_session( + agents_stack.impls[Api.agents], agent_config + ) + turn_request = dict( + agent_id=agent_id, + session_id=session_id, + messages=search_query_messages, + stream=True, + ) + + turn_response = [ + chunk + async for chunk in await agents_stack.impls[Api.agents].create_agent_turn( + **turn_request + ) + ] + + assert len(turn_response) > 0 + assert all( + isinstance(chunk, AgentTurnResponseStreamChunk) for chunk in turn_response + ) + + check_event_types(turn_response) + + # Check for tool execution events + tool_execution_events = [ + chunk + for chunk in turn_response + if isinstance(chunk.event.payload, AgentTurnResponseStepCompletePayload) + and chunk.event.payload.step_details.step_type == StepType.tool_execution.value + ] + assert len(tool_execution_events) > 0, "No tool execution events found" + + # Check the tool execution details + tool_execution = tool_execution_events[0].event.payload.step_details + assert isinstance(tool_execution, ToolExecutionStep) + assert len(tool_execution.tool_calls) > 0 + assert tool_execution.tool_calls[0].tool_name == BuiltinTool.brave_search + assert len(tool_execution.tool_responses) > 0 + + check_turn_complete_event(turn_response, session_id, search_query_messages) + + class TestAgents: @pytest.mark.asyncio async def test_agent_turns_with_safety( @@ -215,63 +282,34 @@ class TestAgents: async def test_create_agent_turn_with_brave_search( self, agents_stack, search_query_messages, common_params ): - agents_impl = agents_stack.impls[Api.agents] - if "BRAVE_SEARCH_API_KEY" not in os.environ: pytest.skip("BRAVE_SEARCH_API_KEY not set, skipping test") - # Create an agent with Brave search tool - agent_config = AgentConfig( - **{ - **common_params, - "tools": [ - SearchToolDefinition( - type=AgentTool.brave_search.value, - api_key=os.environ["BRAVE_SEARCH_API_KEY"], - engine=SearchEngineType.brave, - ) - ], - } + search_tool_definition = SearchToolDefinition( + type=AgentTool.brave_search.value, + api_key=os.environ["BRAVE_SEARCH_API_KEY"], + engine=SearchEngineType.brave, + ) + await create_agent_turn_with_search_tool( + agents_stack, search_query_messages, common_params, search_tool_definition ) - agent_id, session_id = await create_agent_session(agents_impl, agent_config) - turn_request = dict( - agent_id=agent_id, - session_id=session_id, - messages=search_query_messages, - stream=True, + @pytest.mark.asyncio + async def test_create_agent_turn_with_tavily_search( + self, agents_stack, search_query_messages, common_params + ): + if "TAVILY_SEARCH_API_KEY" not in os.environ: + pytest.skip("TAVILY_SEARCH_API_KEY not set, skipping test") + + search_tool_definition = SearchToolDefinition( + type=AgentTool.brave_search.value, # place holder only + api_key=os.environ["TAVILY_SEARCH_API_KEY"], + engine=SearchEngineType.tavily, ) - - turn_response = [ - chunk async for chunk in await agents_impl.create_agent_turn(**turn_request) - ] - - assert len(turn_response) > 0 - assert all( - isinstance(chunk, AgentTurnResponseStreamChunk) for chunk in turn_response + await create_agent_turn_with_search_tool( + agents_stack, search_query_messages, common_params, search_tool_definition ) - check_event_types(turn_response) - - # Check for tool execution events - tool_execution_events = [ - chunk - for chunk in turn_response - if isinstance(chunk.event.payload, 
AgentTurnResponseStepCompletePayload) - and chunk.event.payload.step_details.step_type - == StepType.tool_execution.value - ] - assert len(tool_execution_events) > 0, "No tool execution events found" - - # Check the tool execution details - tool_execution = tool_execution_events[0].event.payload.step_details - assert isinstance(tool_execution, ToolExecutionStep) - assert len(tool_execution.tool_calls) > 0 - assert tool_execution.tool_calls[0].tool_name == BuiltinTool.brave_search - assert len(tool_execution.tool_responses) > 0 - - check_turn_complete_event(turn_response, session_id, search_query_messages) - def check_event_types(turn_response): event_types = [chunk.event.payload.event_type for chunk in turn_response] diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py index 7b7aca5bd..6e263432a 100644 --- a/llama_stack/providers/tests/inference/test_text_inference.py +++ b/llama_stack/providers/tests/inference/test_text_inference.py @@ -25,7 +25,11 @@ from .utils import group_chunks def get_expected_stop_reason(model: str): - return StopReason.end_of_message if "Llama3.1" in model else StopReason.end_of_turn + return ( + StopReason.end_of_message + if ("Llama3.1" in model or "Llama-3.1" in model) + else StopReason.end_of_turn + ) @pytest.fixture @@ -34,7 +38,7 @@ def common_params(inference_model): "tool_choice": ToolChoice.auto, "tool_prompt_format": ( ToolPromptFormat.json - if "Llama3.1" in inference_model + if ("Llama3.1" in inference_model or "Llama-3.1" in inference_model) else ToolPromptFormat.python_list ), } diff --git a/llama_stack/scripts/distro_codegen.py b/llama_stack/scripts/distro_codegen.py index f0d3bb4b9..b82319bd5 100644 --- a/llama_stack/scripts/distro_codegen.py +++ b/llama_stack/scripts/distro_codegen.py @@ -6,6 +6,7 @@ import concurrent.futures import importlib +import json import subprocess import sys from functools import partial @@ -14,6 +15,11 @@ from typing import Iterator from rich.progress import Progress, SpinnerColumn, TextColumn +from llama_stack.distribution.build import ( + get_provider_dependencies, + SERVER_DEPENDENCIES, +) + REPO_ROOT = Path(__file__).parent.parent.parent @@ -67,6 +73,39 @@ def check_for_changes() -> bool: return result.returncode != 0 +def collect_template_dependencies(template_dir: Path) -> tuple[str, list[str]]: + try: + module_name = f"llama_stack.templates.{template_dir.name}" + module = importlib.import_module(module_name) + + if template_func := getattr(module, "get_distribution_template", None): + template = template_func() + normal_deps, special_deps = get_provider_dependencies(template.providers) + # Combine all dependencies in order: normal deps, special deps, server deps + all_deps = sorted(list(set(normal_deps + SERVER_DEPENDENCIES))) + sorted( + list(set(special_deps)) + ) + + return template.name, all_deps + except Exception: + return None, [] + return None, [] + + +def generate_dependencies_file(): + templates_dir = REPO_ROOT / "llama_stack" / "templates" + distribution_deps = {} + + for template_dir in find_template_dirs(templates_dir): + name, deps = collect_template_dependencies(template_dir) + if name: + distribution_deps[name] = deps + + deps_file = REPO_ROOT / "distributions" / "dependencies.json" + with open(deps_file, "w") as f: + json.dump(distribution_deps, f, indent=2) + + def main(): templates_dir = REPO_ROOT / "llama_stack" / "templates" @@ -88,6 +127,8 @@ def main(): list(executor.map(process_func, template_dirs)) 
progress.update(task, advance=len(template_dirs)) + generate_dependencies_file() + if check_for_changes(): print( "Distribution template changes detected. Please commit the changes.", diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index c9c05a8e0..6add39c3a 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -57,11 +57,11 @@ models: provider_id: null provider_model_id: fireworks/llama-v3p1-405b-instruct - metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct + model_id: meta-llama/Llama-3.2-1B-Instruct provider_id: null provider_model_id: fireworks/llama-v3p2-1b-instruct - metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct + model_id: meta-llama/Llama-3.2-3B-Instruct provider_id: null provider_model_id: fireworks/llama-v3p2-3b-instruct - metadata: {} diff --git a/llama_stack/templates/tgi/build.yaml b/llama_stack/templates/tgi/build.yaml index 5f44c2d86..0f7602e2f 100644 --- a/llama_stack/templates/tgi/build.yaml +++ b/llama_stack/templates/tgi/build.yaml @@ -2,7 +2,7 @@ version: '2' name: tgi distribution_spec: description: Use (an external) TGI server for running LLM inference - docker_image: llamastack/distribution-tgi:test-0.0.52rc3 + docker_image: null providers: inference: - remote::tgi diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index b988c28e1..ebf082cd6 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -1,6 +1,6 @@ version: '2' image_name: tgi -docker_image: llamastack/distribution-tgi:test-0.0.52rc3 +docker_image: null conda_env: tgi apis: - agents diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index 485c02ad8..352afabb5 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -1,6 +1,6 @@ version: '2' image_name: tgi -docker_image: llamastack/distribution-tgi:test-0.0.52rc3 +docker_image: null conda_env: tgi apis: - agents diff --git a/llama_stack/templates/tgi/tgi.py b/llama_stack/templates/tgi/tgi.py index 79f2ad395..caa341df3 100644 --- a/llama_stack/templates/tgi/tgi.py +++ b/llama_stack/templates/tgi/tgi.py @@ -41,7 +41,7 @@ def get_distribution_template() -> DistributionTemplate: name="tgi", distro_type="self_hosted", description="Use (an external) TGI server for running LLM inference", - docker_image="llamastack/distribution-tgi:test-0.0.52rc3", + docker_image=None, template_path=Path(__file__).parent / "doc_template.md", providers=providers, default_models=[inference_model, safety_model], diff --git a/requirements.txt b/requirements.txt index da8b8e638..fddf51880 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,8 @@ blobfile fire httpx huggingface-hub -llama-models>=0.0.50 +llama-models>=0.0.53 +llama-stack-client>=0.0.53 prompt-toolkit python-dotenv pydantic>=2 diff --git a/setup.py b/setup.py index 3145506f9..13f389a11 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def read_requirements(): setup( name="llama_stack", - version="0.0.50", + version="0.0.53", author="Meta Llama", author_email="llama-oss@meta.com", description="Llama Stack",