Merge branch 'main' into add-nvidia-inference-adapter

This commit is contained in:
Matthew Farrellee 2024-11-20 09:37:48 -05:00
commit 8a35dc8b0e
28 changed files with 429 additions and 478 deletions

CHANGELOG.md (new file, 35 lines)
View file

@@ -0,0 +1,35 @@
# Changelog
## 0.0.53
### Added
- Resource-oriented design for models, shields, memory banks, datasets and eval tasks
- Persistence for registered objects with distribution
- Ability to persist memory banks created for FAISS
- PostgreSQL KVStore implementation
- Environment variable placeholder support in run.yaml files
- Comprehensive Zero-to-Hero notebooks and quickstart guides
- Support for quantized models in Ollama
- Vision model support for Together, Fireworks, Meta-Reference, Ollama, and vLLM
- Bedrock distribution with safety shields support
- Evals API with task registration and scoring functions
- MMLU and SimpleQA benchmark scoring functions
- Huggingface dataset provider integration for benchmarks
- Support for custom dataset registration from local paths
- Benchmark evaluation CLI tools with visualization tables
- RAG evaluation scoring functions and metrics
- Local persistence for datasets and eval tasks
### Changed
- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
- Updated API signatures for dataset and eval task registration
- Restructured folder organization for providers
- Enhanced Docker build configuration
- Added version prefixing for REST API routes
- Enhanced evaluation task registration workflow
- Improved benchmark evaluation output formatting
- Restructured evals folder organization for better modularity
### Removed
- `llama stack configure` command
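Reviewer note on the "environment variable placeholder" item above: the sketch below illustrates the idea with an assumed `${env.VAR}` placeholder syntax and a hypothetical `expand_env_placeholders` helper; the real resolution logic lives inside llama-stack's config loading.
```python
import os
import re

# Hypothetical helper for illustration only; placeholder grammar is assumed.
_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*)\}")

def expand_env_placeholders(text: str) -> str:
    """Replace each ${env.VAR} occurrence with the value of VAR."""
    def _sub(match: re.Match) -> str:
        name = match.group(1)
        if name not in os.environ:
            raise KeyError(f"environment variable {name} is not set")
        return os.environ[name]
    return _PLACEHOLDER.sub(_sub, text)

# e.g. expand_env_placeholders("api_key: ${env.FIREWORKS_API_KEY}")
```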

View file

@@ -1,4 +1,5 @@
 include requirements.txt
+include distributions/dependencies.json
 include llama_stack/distribution/*.sh
 include llama_stack/cli/scripts/*.sh
 include llama_stack/templates/*/*.yaml

View file

@@ -0,0 +1,171 @@
{
"together": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"together",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"remote-vllm": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"openai",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"fireworks": [
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"fireworks-ai",
"httpx",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"tgi": [
"aiohttp",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"huggingface_hub",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"meta-reference-gpu": [
"accelerate",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"fairscale",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"lm-format-enforcer",
"matplotlib",
"nltk",
"numpy",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"torch",
"torchvision",
"tqdm",
"transformers",
"uvicorn",
"zmq",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
],
"ollama": [
"aiohttp",
"aiosqlite",
"blobfile",
"chardet",
"chromadb-client",
"faiss-cpu",
"fastapi",
"fire",
"httpx",
"matplotlib",
"nltk",
"numpy",
"ollama",
"pandas",
"pillow",
"psycopg2-binary",
"pypdf",
"redis",
"scikit-learn",
"scipy",
"sentencepiece",
"tqdm",
"transformers",
"uvicorn",
"sentence-transformers --no-deps",
"torch --index-url https://download.pytorch.org/whl/cpu"
]
}
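A note on shape: each top-level key is a distribution name and each value a flat list of pip requirements, where some entries carry extra pip flags (`--no-deps`, `--index-url ...`). A minimal reader, assuming the `distributions/dependencies.json` path from the MANIFEST change above:
```python
import json
import shlex
from pathlib import Path

# Sketch: load the generated per-distribution dependency lists.
deps = json.loads(Path("distributions/dependencies.json").read_text())

for requirement in deps["ollama"]:
    # Entries like "torch --index-url https://..." embed pip flags,
    # so split them shell-style before passing them to an installer.
    print(shlex.split(requirement))
```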

View file

@@ -21,7 +21,7 @@
     "info": {
         "title": "[DRAFT] Llama Stack Specification",
         "version": "alpha",
-        "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-11-18 23:37:24.867143"
+        "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-11-19 09:14:01.145131"
     },
     "servers": [
         {

View file

@@ -3400,7 +3400,7 @@ info:
   description: "This is the specification of the llama stack that provides\n \
     \ a set of endpoints and their corresponding interfaces that are tailored\
     \ to\n best leverage Llama Models. The specification is still in\
-    \ draft and subject to change.\n Generated at 2024-11-18 23:37:24.867143"
+    \ draft and subject to change.\n Generated at 2024-11-19 09:14:01.145131"
   title: '[DRAFT] Llama Stack Specification'
   version: alpha
 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema

View file

@@ -25,8 +25,8 @@ The following models are available by default:
 - `meta-llama/Llama-3.1-8B-Instruct (fireworks/llama-v3p1-8b-instruct)`
 - `meta-llama/Llama-3.1-70B-Instruct (fireworks/llama-v3p1-70b-instruct)`
 - `meta-llama/Llama-3.1-405B-Instruct-FP8 (fireworks/llama-v3p1-405b-instruct)`
-- `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-1b-instruct)`
-- `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-3b-instruct)`
+- `meta-llama/Llama-3.2-1B-Instruct (fireworks/llama-v3p2-1b-instruct)`
+- `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-3b-instruct)`
 - `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-11b-vision-instruct)`
 - `meta-llama/Llama-3.2-90B-Vision-Instruct (fireworks/llama-v3p2-90b-vision-instruct)`
 - `meta-llama/Llama-Guard-3-8B (fireworks/llama-guard-3-8b)`

View file

@@ -23,5 +23,6 @@ tgi
 dell-tgi
 together
 fireworks
+remote-vllm
 bedrock
 ```

View file

@@ -53,9 +53,9 @@ Please see our pages in detail for the types of distributions we offer:
 3. [On-device Distribution](./distributions/ondevice_distro/index.md): If you want to run Llama Stack inference on your iOS / Android device.
-### Quick Start Commands
-Once you have decided on the inference provider and distribution to use, use the following quick start commands to get started.
+### Table of Contents
+Once you have decided on the inference provider and distribution to use, use the following guides to get started.
 ##### 1.0 Prerequisite
@@ -109,421 +109,33 @@ Access to Single-Node CPU with Fireworks hosted endpoint via API_KEY from [firew
 ##### 1.1. Start the distribution
-**(Option 1) Via Docker**
 ::::{tab-set}
 :::{tab-item} meta-reference-gpu
-```
-$ cd llama-stack/distributions/meta-reference-gpu && docker compose up
-```
-This will download and start running a pre-built Docker container. Alternatively, you may use the following commands:
-```
-docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-gpu --yaml_config /root/my-run.yaml
-```
+- [Start Meta Reference GPU Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html)
 :::
 :::{tab-item} vLLM
-```
-$ cd llama-stack/distributions/remote-vllm && docker compose up
-```
-The script will first start up vLLM server on port 8000, then start up Llama Stack distribution server hooking up to it for inference. You should see the following outputs --
-```
-<TO BE FILLED>
-```
-To kill the server
-```
-docker compose down
-```
+- [Start vLLM Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/remote-vllm.html)
 :::
 :::{tab-item} tgi
-```
-$ cd llama-stack/distributions/tgi && docker compose up
-```
-The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference. You should see the following outputs --
-```
-[text-generation-inference] | 2024-10-15T18:56:33.810397Z INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama)
-[text-generation-inference] | 2024-10-15T18:56:33.810448Z WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0
-[text-generation-inference] | 2024-10-15T18:56:33.864143Z INFO text_generation_router::server: router/src/server.rs:2353: Connected
-INFO: Started server process [1]
-INFO: Waiting for application startup.
-INFO: Application startup complete.
-INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
-```
-To kill the server
-```
-docker compose down
-```
+- [Start TGI Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html)
-:::
-:::{tab-item} ollama
-```
-$ cd llama-stack/distributions/ollama && docker compose up
-# OR
-$ cd llama-stack/distributions/ollama-gpu && docker compose up
-```
-You will see outputs similar to following ---
-```
-[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps"
-[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps"
-INFO: Started server process [1]
-INFO: Waiting for application startup.
-INFO: Application startup complete.
-INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
-[llamastack] | Resolved 12 providers
-[llamastack] | inner-inference => ollama0
-[llamastack] | models => __routing_table__
-[llamastack] | inference => __autorouted__
-```
-To kill the server
-```
-docker compose down
-```
-:::
-:::{tab-item} fireworks
-```
-$ cd llama-stack/distributions/fireworks && docker compose up
-```
-Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks URL server endpoint. E.g.
-```
-inference:
-  - provider_id: fireworks
-    provider_type: remote::fireworks
-    config:
-      url: https://api.fireworks.ai/inference
-      api_key: <optional api key>
-```
-:::
-:::{tab-item} together
-```
-$ cd distributions/together && docker compose up
-```
-Make sure your `run.yaml` file has the inference provider pointing to the correct Together URL server endpoint. E.g.
-```
-inference:
-  - provider_id: together
-    provider_type: remote::together
-    config:
-      url: https://api.together.xyz/v1
-      api_key: <optional api key>
-```
-:::
-::::
-**(Option 2) Via Conda**
-::::{tab-set}
-:::{tab-item} meta-reference-gpu
-1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html)
-2. Build the `meta-reference-gpu` distribution
-```
-$ llama stack build --template meta-reference-gpu --image-type conda
-```
-3. Start running distribution
-```
-$ llama stack run ~/.llama/distributions/llamastack-meta-reference-gpu/meta-reference-gpu-run.yaml
-```
-Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section:
-```
-memory:
-  - provider_id: faiss-0
-    provider_type: faiss
-    config:
-      kvstore:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/faiss_store.db
-```
-:::
-:::{tab-item} tgi
-1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html)
-2. Build the `tgi` distribution
-```bash
-llama stack build --template tgi --image-type conda
-```
-3. Start a TGI server endpoint
-4. Make sure in your `run.yaml` file, your `conda_env` is pointing to the conda environment and inference provider is pointing to the correct TGI server endpoint. E.g.
-```
-conda_env: llamastack-tgi
-...
-inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url: http://127.0.0.1:5009
-```
-5. Start Llama Stack server
-```bash
-$ llama stack run ~/.llama/distributions/llamastack-tgi/tgi-run.yaml
-```
-Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section:
-```
-memory:
-  - provider_id: faiss-0
-    provider_type: faiss
-    config:
-      kvstore:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/faiss_store.db
-```
 :::
 :::{tab-item} ollama
+- [Start Ollama Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html)
-If you wish to separately spin up a Ollama server, and connect with Llama Stack, you may use the following commands.
-#### Start Ollama server.
-- Please check the [Ollama Documentations](https://github.com/ollama/ollama) for more details.
-**Via Docker**
-```
-docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
-```
-**Via CLI**
-```
-ollama run <model_id>
-```
-#### Start Llama Stack server pointing to Ollama server
-Make sure your `run.yaml` file has the inference provider pointing to the correct Ollama endpoint. E.g.
-```
-conda_env: llamastack-ollama
-...
-inference:
-  - provider_id: ollama0
-    provider_type: remote::ollama
-    config:
-      url: http://127.0.0.1:11434
-```
-```
-llama stack build --template ollama --image-type conda
-llama stack run ~/.llama/distributions/llamastack-ollama/ollama-run.yaml
-```
-Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section:
-```
-memory:
-  - provider_id: faiss-0
-    provider_type: faiss
-    config:
-      kvstore:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/faiss_store.db
-```
-:::
-:::{tab-item} fireworks
-```bash
-llama stack build --template fireworks --image-type conda
-# -- modify run.yaml to a valid Fireworks server endpoint
-llama stack run ./run.yaml
-```
-Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks URL server endpoint. E.g.
-```
-conda_env: llamastack-fireworks
-...
-inference:
-  - provider_id: fireworks
-    provider_type: remote::fireworks
-    config:
-      url: https://api.fireworks.ai/inference
-      api_key: <optional api key>
-```
 :::
 :::{tab-item} together
+- [Start Together Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html)
-```bash
-llama stack build --template together --image-type conda
-# -- modify run.yaml to a valid Together server endpoint
-llama stack run ~/.llama/distributions/llamastack-together/together-run.yaml
-```
-Make sure your `run.yaml` file has the inference provider pointing to the correct Together URL server endpoint. E.g.
-```
-conda_env: llamastack-together
-...
-inference:
-  - provider_id: together
-    provider_type: remote::together
-    config:
-      url: https://api.together.xyz/v1
-      api_key: <optional api key>
-```
-:::
-::::
-##### 1.2 (Optional) Update Model Serving Configuration
-::::{tab-set}
-:::{tab-item} meta-reference-gpu
-You may change the `config.model` in `run.yaml` to update the model currently being served by the distribution. Make sure you have the model checkpoint downloaded in your `~/.llama`.
-```
-inference:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      model: Llama3.2-11B-Vision-Instruct
-      quantization: null
-      torch_seed: null
-      max_seq_len: 4096
-      max_batch_size: 1
-```
-Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
-:::
-:::{tab-item} tgi
-To serve a new model with `tgi`, change the docker command flag `--model-id <model-to-serve>`.
-This can be done by edit the `command` args in `compose.yaml`. E.g. Replace "Llama-3.2-1B-Instruct" with the model you want to serve.
-```
-command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.2-1B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
-```
-or by changing the docker run command's `--model-id` flag
-```
-docker run --rm -it -v $HOME/.cache/huggingface:/data -p 5009:5009 --gpus all ghcr.io/huggingface/text-generation-inference:latest --dtype bfloat16 --usage-stats on --sharded false --model-id meta-llama/Llama-3.2-1B-Instruct --port 5009
-```
-Make sure your `run.yaml` file has the inference provider pointing to the TGI server endpoint serving your model.
-```
-inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url: http://127.0.0.1:5009
-```
-```
-Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
-:::
-:::{tab-item} ollama
-You can use ollama for managing model downloads.
-```
-ollama pull llama3.1:8b-instruct-fp16
-ollama pull llama3.1:70b-instruct-fp16
-```
-> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models.
-To serve a new model with `ollama`
-```
-ollama run <model_name>
-```
-To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama.
-```
-$ ollama ps
-NAME ID SIZE PROCESSOR UNTIL
-llama3.1:8b-instruct-fp16 4aacac419454 17 GB 100% GPU 4 minutes from now
-```
-To verify that the model served by ollama is correctly connected to Llama Stack server
-```
-$ llama-stack-client models list
-+----------------------+----------------------+---------------+-----------------------------------------------+
-| identifier | llama_model | provider_id | metadata |
-+======================+======================+===============+===============================================+
-| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | ollama0 | {'ollama_model': 'llama3.1:8b-instruct-fp16'} |
-+----------------------+----------------------+---------------+-----------------------------------------------+
-```
-:::
-:::{tab-item} together
-Use `llama-stack-client models list` to check the available models served by together.
-```
-$ llama-stack-client models list
-+------------------------------+------------------------------+---------------+------------+
-| identifier | llama_model | provider_id | metadata |
-+==============================+==============================+===============+============+
-| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | together0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | together0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | together0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | together0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-```
 :::
 :::{tab-item} fireworks
-Use `llama-stack-client models list` to check the available models served by Fireworks.
+- [Start Fireworks Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html)
-```
-$ llama-stack-client models list
-+------------------------------+------------------------------+---------------+------------+
-| identifier | llama_model | provider_id | metadata |
-+==============================+==============================+===============+============+
-| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | fireworks0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | fireworks0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | fireworks0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-1B-Instruct | Llama3.2-1B-Instruct | fireworks0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | fireworks0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | fireworks0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | fireworks0 | {} |
-+------------------------------+------------------------------+---------------+------------+
-```
 :::
 ::::
 ##### Troubleshooting
 - If you encounter any issues, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file an new issue.
 - Use `--port <PORT>` flag to use a different port number. For docker run, update the `-p <PORT>:<PORT>` flag.
@@ -535,10 +147,10 @@ $ llama-stack-client models list
 Once the server is set up, we can test it with a client to verify it's working correctly. The following command will send a chat completion request to the server's `/inference/chat_completion` API:
 ```bash
-$ curl http://localhost:5000/inference/chat_completion \
+$ curl http://localhost:5000/alpha/inference/chat-completion \
 -H "Content-Type: application/json" \
 -d '{
-    "model_id": "Llama3.1-8B-Instruct",
+    "model_id": "meta-llama/Llama-3.1-8B-Instruct",
     "messages": [
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "Write me a 2 sentence poem about the moon"}

View file

@@ -22,14 +22,22 @@ If you're looking for more specific topics like tool calling or agent setup, we
 - Download and unzip `Ollama-darwin.zip`.
 - Run the `Ollama` application.
-2. **Download the Ollama CLI**:
+1. **Download the Ollama CLI**:
 - Ensure you have the `ollama` command line tool by downloading and installing it from the same website.
-3. **Verify Installation**:
+1. **Start ollama server**:
+- Open the terminal and run:
+```
+ollama serve
+```
+1. **Run the model**:
 - Open the terminal and run:
 ```bash
-ollama run llama3.2:1b
+ollama run llama3.2:3b-instruct-fp16
 ```
+**Note**: The supported models for llama stack for now is listed in [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L43)
 ---
@@ -84,6 +92,8 @@ If you're looking for more specific topics like tool calling or agent setup, we
 ```bash
 llama stack run /path/to/your/distro/llamastack-ollama/ollama-run.yaml --port 5050
 ```
+Note:
+1. Everytime you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model
 The server will start and listen on `http://localhost:5050`.
@@ -97,7 +107,7 @@ After setting up the server, open a new terminal window and verify it's working
 curl http://localhost:5050/inference/chat_completion \
 -H "Content-Type: application/json" \
 -d '{
-    "model": "llama3.2:1b",
+    "model": "Llama3.2-3B-Instruct",
     "messages": [
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "Write me a 2-sentence poem about the moon"}
@@ -106,6 +116,8 @@ curl http://localhost:5050/inference/chat_completion \
 }'
 ```
+You can check the available models with the command `llama-stack-client models list`.
 **Expected Output:**
 ```json
 {

View file

@@ -54,6 +54,7 @@ class ToolDefinitionCommon(BaseModel):
 class SearchEngineType(Enum):
     bing = "bing"
     brave = "brave"
+    tavily = "tavily"
 @json_schema_type

View file

@@ -380,6 +380,7 @@ def _hf_download(
 def _meta_download(
     model: "Model",
+    model_id: str,
     meta_url: str,
     info: "LlamaDownloadInfo",
     max_concurrent_downloads: int,
@@ -405,8 +406,15 @@ def _meta_download(
     downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
     asyncio.run(downloader.download_all(tasks))
-    print(f"\nSuccessfully downloaded model to {output_dir}")
-    cprint(f"\nMD5 Checksums are at: {output_dir / 'checklist.chk'}", "white")
+    cprint(f"\nSuccessfully downloaded model to {output_dir}", "green")
+    cprint(
+        f"\nView MD5 checksum files at: {output_dir / 'checklist.chk'}",
+        "white",
+    )
+    cprint(
+        f"\n[Optionally] To run MD5 checksums, use the following command: llama model verify-download --model-id {model_id}",
+        "yellow",
+    )
 class ModelEntry(BaseModel):
@@ -512,7 +520,7 @@ def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
     )
     if "llamameta.net" not in meta_url:
         parser.error("Invalid Meta URL provided")
-    _meta_download(model, meta_url, info, args.max_parallel)
+    _meta_download(model, model_id, meta_url, info, args.max_parallel)
 except Exception as e:
     parser.error(f"Download failed: {str(e)}")

View file

@@ -9,6 +9,7 @@
 LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
 LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
 TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
+BUILD_PLATFORM=${BUILD_PLATFORM:-}
 if [ "$#" -lt 4 ]; then
   echo "Usage: $0 <build_name> <docker_base> <pip_dependencies> [<special_pip_deps>]" >&2
@@ -96,7 +97,7 @@ else
   add_to_docker "RUN pip install fastapi libcst"
   add_to_docker <<EOF
 RUN pip install --no-cache --extra-index-url https://test.pypi.org/simple/ \
-  llama-models==$TEST_PYPI_VERSION llama-stack==$TEST_PYPI_VERSION
+  llama-models==$TEST_PYPI_VERSION llama-stack-client==$TEST_PYPI_VERSION llama-stack==$TEST_PYPI_VERSION
 EOF
 else
   add_to_docker "RUN pip install --no-cache llama-stack"
@@ -116,7 +117,6 @@ RUN pip install --no-cache $models_mount
 EOF
 fi
 add_to_docker <<EOF
-# This would be good in production but for debugging flexibility lets not add it right now
@@ -158,7 +158,9 @@ image_tag="$image_name:$version_tag"
 # Detect platform architecture
 ARCH=$(uname -m)
-if [ "$ARCH" = "arm64" ] || [ "$ARCH" = "aarch64" ]; then
+if [ -n "$BUILD_PLATFORM" ]; then
+  PLATFORM="--platform $BUILD_PLATFORM"
+elif [ "$ARCH" = "arm64" ] || [ "$ARCH" = "aarch64" ]; then
   PLATFORM="--platform linux/arm64"
 elif [ "$ARCH" = "x86_64" ]; then
   PLATFORM="--platform linux/amd64"

View file

@@ -15,6 +15,8 @@ import httpx
 from pydantic import BaseModel, parse_obj_as
 from termcolor import cprint
+from llama_stack.apis.version import LLAMA_STACK_API_VERSION
 from llama_stack.providers.datatypes import RemoteProviderConfig
 _CLIENT_CLASSES = {}
@@ -117,7 +119,7 @@ def create_api_client_class(protocol) -> Type:
             break
         kwargs[param.name] = args[i]
-    url = f"{self.base_url}{webmethod.route}"
+    url = f"{self.base_url}/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}"
     def convert(value):
         if isinstance(value, list):
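The route change in isolation — a minimal sketch assuming `LLAMA_STACK_API_VERSION` is the string `"alpha"` (consistent with the `/alpha/...` curl examples elsewhere in this commit):
```python
LLAMA_STACK_API_VERSION = "alpha"  # assumed value for illustration

def versioned_url(base_url: str, route: str) -> str:
    # Before: f"{base_url}{webmethod.route}"
    # After: the API version sits between host and route.
    return f"{base_url}/{LLAMA_STACK_API_VERSION}/{route.lstrip('/')}"

assert (
    versioned_url("http://localhost:5000", "/inference/chat-completion")
    == "http://localhost:5000/alpha/inference/chat-completion"
)
```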

View file

@@ -9,6 +9,8 @@ from typing import Dict, List
 from pydantic import BaseModel
+from llama_stack.apis.version import LLAMA_STACK_API_VERSION
 from llama_stack.distribution.resolver import api_protocol_map
 from llama_stack.providers.datatypes import Api
@@ -33,7 +35,7 @@ def get_all_api_endpoints() -> Dict[Api, List[ApiEndpoint]]:
         continue
     webmethod = method.__webmethod__
-    route = webmethod.route
+    route = f"/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}"
     if webmethod.method == "GET":
         method = "get"

View file

@@ -86,10 +86,13 @@ class PhotogenTool(SingleMessageBuiltinTool):
 class SearchTool(SingleMessageBuiltinTool):
     def __init__(self, engine: SearchEngineType, api_key: str, **kwargs) -> None:
         self.api_key = api_key
+        self.engine_type = engine
         if engine == SearchEngineType.bing:
             self.engine = BingSearch(api_key, **kwargs)
         elif engine == SearchEngineType.brave:
             self.engine = BraveSearch(api_key, **kwargs)
+        elif engine == SearchEngineType.tavily:
+            self.engine = TavilySearch(api_key, **kwargs)
         else:
             raise ValueError(f"Unknown search engine: {engine}")
@@ -257,6 +260,21 @@ class BraveSearch:
         return {"query": query, "top_k": clean_response}
+class TavilySearch:
+    def __init__(self, api_key: str) -> None:
+        self.api_key = api_key
+    async def search(self, query: str) -> str:
+        response = requests.post(
+            "https://api.tavily.com/search",
+            json={"api_key": self.api_key, "query": query},
+        )
+        return json.dumps(self._clean_tavily_response(response.json()))
+    def _clean_tavily_response(self, search_response, top_k=3):
+        return {"query": search_response["query"], "top_k": search_response["results"]}
 class WolframAlphaTool(SingleMessageBuiltinTool):
     def __init__(self, api_key: str) -> None:
         self.api_key = api_key
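For reference, the new `TavilySearch` helper can be exercised on its own; this snippet restates the class exactly as added above so it runs without knowing its module path, and assumes a `TAVILY_SEARCH_API_KEY` environment variable:
```python
import asyncio
import json
import os

import requests

# Restatement of the TavilySearch class from the hunk above.
class TavilySearch:
    def __init__(self, api_key: str) -> None:
        self.api_key = api_key

    async def search(self, query: str) -> str:
        response = requests.post(
            "https://api.tavily.com/search",
            json={"api_key": self.api_key, "query": query},
        )
        return json.dumps(self._clean_tavily_response(response.json()))

    def _clean_tavily_response(self, search_response, top_k=3):
        return {"query": search_response["query"], "top_k": search_response["results"]}

if __name__ == "__main__":
    tool = TavilySearch(api_key=os.environ["TAVILY_SEARCH_API_KEY"])
    print(asyncio.run(tool.search("What is Llama Stack?")))
```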

View file

@@ -50,11 +50,11 @@ MODEL_ALIASES = [
     ),
     build_model_alias(
         "fireworks/llama-v3p2-1b-instruct",
-        CoreModelId.llama3_2_3b_instruct.value,
+        CoreModelId.llama3_2_1b_instruct.value,
     ),
     build_model_alias(
         "fireworks/llama-v3p2-3b-instruct",
-        CoreModelId.llama3_2_11b_vision_instruct.value,
+        CoreModelId.llama3_2_3b_instruct.value,
     ),
     build_model_alias(
         "fireworks/llama-v3p2-11b-vision-instruct",
@@ -214,10 +214,10 @@ class FireworksInferenceAdapter(
         async def _to_async_generator():
             if "messages" in params:
-                stream = await self._get_client().chat.completions.acreate(**params)
+                stream = self._get_client().chat.completions.acreate(**params)
             else:
-                stream = self._get_client().completion.create(**params)
+                stream = self._get_client().completion.acreate(**params)
-            for chunk in stream:
+            async for chunk in stream:
                 yield chunk
         stream = _to_async_generator()
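The streaming fix above follows a general asyncio pattern: an `acreate`-style call returns an async stream that should not be awaited eagerly, and must be consumed with `async for`. A self-contained sketch of that pattern (toy stand-ins, not the Fireworks client):
```python
import asyncio
from typing import AsyncIterator

async def fake_acreate(n: int) -> AsyncIterator[int]:
    # Stand-in for an acreate()-style streaming call.
    for chunk in range(n):
        await asyncio.sleep(0)  # simulate network latency
        yield chunk

async def to_async_generator():
    stream = fake_acreate(3)  # note: no `await` here
    async for chunk in stream:  # `async for`, not `for`
        yield chunk

async def main() -> None:
    async for chunk in to_async_generator():
        print(chunk)

asyncio.run(main())
```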

View file

@@ -264,6 +264,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
 class TGIAdapter(_HfAdapter):
     async def initialize(self, config: TGIImplConfig) -> None:
+        print(f"Initializing TGI client with url={config.url}")
         self.client = AsyncInferenceClient(model=config.url, token=config.api_token)
         endpoint_info = await self.client.get_endpoint_info()
         self.max_tokens = endpoint_info["max_total_tokens"]

View file

@@ -53,6 +53,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         self.client = None
     async def initialize(self) -> None:
+        print(f"Initializing VLLM client with base_url={self.config.url}")
         self.client = OpenAI(base_url=self.config.url, api_key=self.config.api_token)
     async def shutdown(self) -> None:

View file

@@ -68,6 +68,73 @@ def query_attachment_messages():
     ]
+async def create_agent_turn_with_search_tool(
+    agents_stack: Dict[str, object],
+    search_query_messages: List[object],
+    common_params: Dict[str, str],
+    search_tool_definition: SearchToolDefinition,
+) -> None:
+    """
+    Create an agent turn with a search tool.
+
+    Args:
+        agents_stack (Dict[str, object]): The agents stack.
+        search_query_messages (List[object]): The search query messages.
+        common_params (Dict[str, str]): The common parameters.
+        search_tool_definition (SearchToolDefinition): The search tool definition.
+    """
+    # Create an agent with the search tool
+    agent_config = AgentConfig(
+        **{
+            **common_params,
+            "tools": [search_tool_definition],
+        }
+    )
+    agent_id, session_id = await create_agent_session(
+        agents_stack.impls[Api.agents], agent_config
+    )
+    turn_request = dict(
+        agent_id=agent_id,
+        session_id=session_id,
+        messages=search_query_messages,
+        stream=True,
+    )
+    turn_response = [
+        chunk
+        async for chunk in await agents_stack.impls[Api.agents].create_agent_turn(
+            **turn_request
+        )
+    ]
+    assert len(turn_response) > 0
+    assert all(
+        isinstance(chunk, AgentTurnResponseStreamChunk) for chunk in turn_response
+    )
+    check_event_types(turn_response)
+    # Check for tool execution events
+    tool_execution_events = [
+        chunk
+        for chunk in turn_response
+        if isinstance(chunk.event.payload, AgentTurnResponseStepCompletePayload)
+        and chunk.event.payload.step_details.step_type == StepType.tool_execution.value
+    ]
+    assert len(tool_execution_events) > 0, "No tool execution events found"
+    # Check the tool execution details
+    tool_execution = tool_execution_events[0].event.payload.step_details
+    assert isinstance(tool_execution, ToolExecutionStep)
+    assert len(tool_execution.tool_calls) > 0
+    assert tool_execution.tool_calls[0].tool_name == BuiltinTool.brave_search
+    assert len(tool_execution.tool_responses) > 0
+    check_turn_complete_event(turn_response, session_id, search_query_messages)
 class TestAgents:
     @pytest.mark.asyncio
     async def test_agent_turns_with_safety(
@@ -215,63 +282,34 @@ class TestAgents:
     async def test_create_agent_turn_with_brave_search(
         self, agents_stack, search_query_messages, common_params
     ):
-        agents_impl = agents_stack.impls[Api.agents]
         if "BRAVE_SEARCH_API_KEY" not in os.environ:
             pytest.skip("BRAVE_SEARCH_API_KEY not set, skipping test")
-        # Create an agent with Brave search tool
-        agent_config = AgentConfig(
-            **{
-                **common_params,
-                "tools": [
-                    SearchToolDefinition(
-                        type=AgentTool.brave_search.value,
-                        api_key=os.environ["BRAVE_SEARCH_API_KEY"],
-                        engine=SearchEngineType.brave,
-                    )
-                ],
-            }
+        search_tool_definition = SearchToolDefinition(
+            type=AgentTool.brave_search.value,
+            api_key=os.environ["BRAVE_SEARCH_API_KEY"],
+            engine=SearchEngineType.brave,
+        )
+        await create_agent_turn_with_search_tool(
+            agents_stack, search_query_messages, common_params, search_tool_definition
         )
-        agent_id, session_id = await create_agent_session(agents_impl, agent_config)
-        turn_request = dict(
-            agent_id=agent_id,
-            session_id=session_id,
-            messages=search_query_messages,
-            stream=True,
-        )
-        turn_response = [
-            chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
-        ]
-        assert len(turn_response) > 0
-        assert all(
-            isinstance(chunk, AgentTurnResponseStreamChunk) for chunk in turn_response
-        )
-        check_event_types(turn_response)
-        # Check for tool execution events
-        tool_execution_events = [
-            chunk
-            for chunk in turn_response
-            if isinstance(chunk.event.payload, AgentTurnResponseStepCompletePayload)
-            and chunk.event.payload.step_details.step_type
-            == StepType.tool_execution.value
-        ]
-        assert len(tool_execution_events) > 0, "No tool execution events found"
-        # Check the tool execution details
-        tool_execution = tool_execution_events[0].event.payload.step_details
-        assert isinstance(tool_execution, ToolExecutionStep)
-        assert len(tool_execution.tool_calls) > 0
-        assert tool_execution.tool_calls[0].tool_name == BuiltinTool.brave_search
-        assert len(tool_execution.tool_responses) > 0
-        check_turn_complete_event(turn_response, session_id, search_query_messages)
+    @pytest.mark.asyncio
+    async def test_create_agent_turn_with_tavily_search(
+        self, agents_stack, search_query_messages, common_params
+    ):
+        if "TAVILY_SEARCH_API_KEY" not in os.environ:
+            pytest.skip("TAVILY_SEARCH_API_KEY not set, skipping test")
+        search_tool_definition = SearchToolDefinition(
+            type=AgentTool.brave_search.value,  # place holder only
+            api_key=os.environ["TAVILY_SEARCH_API_KEY"],
+            engine=SearchEngineType.tavily,
+        )
+        await create_agent_turn_with_search_tool(
+            agents_stack, search_query_messages, common_params, search_tool_definition
+        )
 def check_event_types(turn_response):
     event_types = [chunk.event.payload.event_type for chunk in turn_response]

View file

@@ -25,7 +25,11 @@ from .utils import group_chunks
 def get_expected_stop_reason(model: str):
-    return StopReason.end_of_message if "Llama3.1" in model else StopReason.end_of_turn
+    return (
+        StopReason.end_of_message
+        if ("Llama3.1" in model or "Llama-3.1" in model)
+        else StopReason.end_of_turn
+    )
 @pytest.fixture
@@ -34,7 +38,7 @@ def common_params(inference_model):
     "tool_choice": ToolChoice.auto,
     "tool_prompt_format": (
         ToolPromptFormat.json
-        if "Llama3.1" in inference_model
+        if ("Llama3.1" in inference_model or "Llama-3.1" in inference_model)
         else ToolPromptFormat.python_list
     ),
 }
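The broadened checks accept both model-name spellings; distilled into a tiny helper (illustrative only):
```python
def is_llama_3_1(model: str) -> bool:
    # Matches both "Llama3.1-8B-Instruct" and "meta-llama/Llama-3.1-8B-Instruct".
    return "Llama3.1" in model or "Llama-3.1" in model

assert is_llama_3_1("Llama3.1-8B-Instruct")
assert is_llama_3_1("meta-llama/Llama-3.1-8B-Instruct")
assert not is_llama_3_1("meta-llama/Llama-3.2-3B-Instruct")
```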

View file

@@ -6,6 +6,7 @@
 import concurrent.futures
 import importlib
+import json
 import subprocess
 import sys
 from functools import partial
@@ -14,6 +15,11 @@ from typing import Iterator
 from rich.progress import Progress, SpinnerColumn, TextColumn
+from llama_stack.distribution.build import (
+    get_provider_dependencies,
+    SERVER_DEPENDENCIES,
+)
 REPO_ROOT = Path(__file__).parent.parent.parent
@@ -67,6 +73,39 @@ def check_for_changes() -> bool:
     return result.returncode != 0
+def collect_template_dependencies(template_dir: Path) -> tuple[str, list[str]]:
+    try:
+        module_name = f"llama_stack.templates.{template_dir.name}"
+        module = importlib.import_module(module_name)
+        if template_func := getattr(module, "get_distribution_template", None):
+            template = template_func()
+            normal_deps, special_deps = get_provider_dependencies(template.providers)
+            # Combine all dependencies in order: normal deps, special deps, server deps
+            all_deps = sorted(list(set(normal_deps + SERVER_DEPENDENCIES))) + sorted(
+                list(set(special_deps))
+            )
+            return template.name, all_deps
+    except Exception:
+        return None, []
+    return None, []
+def generate_dependencies_file():
+    templates_dir = REPO_ROOT / "llama_stack" / "templates"
+    distribution_deps = {}
+    for template_dir in find_template_dirs(templates_dir):
+        name, deps = collect_template_dependencies(template_dir)
+        if name:
+            distribution_deps[name] = deps
+    deps_file = REPO_ROOT / "distributions" / "dependencies.json"
+    with open(deps_file, "w") as f:
+        json.dump(distribution_deps, f, indent=2)
 def main():
     templates_dir = REPO_ROOT / "llama_stack" / "templates"
@@ -88,6 +127,8 @@ def main():
     list(executor.map(process_func, template_dirs))
     progress.update(task, advance=len(template_dirs))
+    generate_dependencies_file()
 if check_for_changes():
     print(
         "Distribution template changes detected. Please commit the changes.",

View file

@@ -57,11 +57,11 @@ models:
   provider_id: null
   provider_model_id: fireworks/llama-v3p1-405b-instruct
 - metadata: {}
-  model_id: meta-llama/Llama-3.2-3B-Instruct
+  model_id: meta-llama/Llama-3.2-1B-Instruct
   provider_id: null
   provider_model_id: fireworks/llama-v3p2-1b-instruct
 - metadata: {}
-  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
+  model_id: meta-llama/Llama-3.2-3B-Instruct
   provider_id: null
   provider_model_id: fireworks/llama-v3p2-3b-instruct
 - metadata: {}

View file

@@ -2,7 +2,7 @@ version: '2'
 name: tgi
 distribution_spec:
   description: Use (an external) TGI server for running LLM inference
-  docker_image: llamastack/distribution-tgi:test-0.0.52rc3
+  docker_image: null
 providers:
   inference:
   - remote::tgi

View file

@@ -1,6 +1,6 @@
 version: '2'
 image_name: tgi
-docker_image: llamastack/distribution-tgi:test-0.0.52rc3
+docker_image: null
 conda_env: tgi
 apis:
 - agents

View file

@@ -1,6 +1,6 @@
 version: '2'
 image_name: tgi
-docker_image: llamastack/distribution-tgi:test-0.0.52rc3
+docker_image: null
 conda_env: tgi
 apis:
 - agents

View file

@@ -41,7 +41,7 @@ def get_distribution_template() -> DistributionTemplate:
         name="tgi",
         distro_type="self_hosted",
         description="Use (an external) TGI server for running LLM inference",
-        docker_image="llamastack/distribution-tgi:test-0.0.52rc3",
+        docker_image=None,
         template_path=Path(__file__).parent / "doc_template.md",
         providers=providers,
         default_models=[inference_model, safety_model],

View file

@@ -2,7 +2,8 @@ blobfile
 fire
 httpx
 huggingface-hub
-llama-models>=0.0.50
+llama-models>=0.0.53
+llama-stack-client>=0.0.53
 prompt-toolkit
 python-dotenv
 pydantic>=2

View file

@@ -16,7 +16,7 @@ def read_requirements():
 setup(
     name="llama_stack",
-    version="0.0.50",
+    version="0.0.53",
     author="Meta Llama",
     author_email="llama-oss@meta.com",
     description="Llama Stack",