Merge branch 'main' into add-nvidia-inference-adapter

2025-12-16 10:12:37 +00:00 · 2024-11-20 09:37:48 -05:00 · 2024-11-20 09:37:48 -05:00 · 8a35dc8b0e
commit 8a35dc8b0e
parent 4ccf4ef641 89f5093dfc
28 changed files with 429 additions and 478 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -0,0 +1,35 @@
+# Changelog
+
+## 0.0.53
+
+### Added
+- Resource-oriented design for models, shields, memory banks, datasets and eval tasks
+- Persistence for registered objects with distribution
+- Ability to persist memory banks created for FAISS
+- PostgreSQL KVStore implementation
+- Environment variable placeholder support in run.yaml files
+- Comprehensive Zero-to-Hero notebooks and quickstart guides
+- Support for quantized models in Ollama
+- Vision models support for Together, Fireworks, Meta-Reference, Ollama, and vLLM
+- Bedrock distribution with safety shields support
+- Evals API with task registration and scoring functions
+- MMLU and SimpleQA benchmark scoring functions
+- Huggingface dataset provider integration for benchmarks
+- Support for custom dataset registration from local paths
+- Benchmark evaluation CLI tools with visualization tables
+- RAG evaluation scoring functions and metrics
+- Local persistence for datasets and eval tasks
+
+### Changed
+- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner)
+- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`)
+- Updated API signatures for dataset and eval task registration
+- Restructured folder organization for providers
+- Enhanced Docker build configuration
+- Added version prefixing for REST API routes
+- Enhanced evaluation task registration workflow
+- Improved benchmark evaluation output formatting
+- Restructured evals folder organization for better modularity
+
+### Removed
+- `llama stack configure` command
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,4 +1,5 @@
 include requirements.txt
+include distributions/dependencies.json
 include llama_stack/distribution/*.sh
 include llama_stack/cli/scripts/*.sh
 include llama_stack/templates/*/*.yaml
--- a/distributions/dependencies.json
+++ b/distributions/dependencies.json
@ -0,0 +1,171 @@
+{
+  "together": [
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "together",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "remote-vllm": [
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "openai",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "fireworks": [
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "fireworks-ai",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "tgi": [
+    "aiohttp",
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "huggingface_hub",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "meta-reference-gpu": [
+    "accelerate",
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "fairscale",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "lm-format-enforcer",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "torch",
+    "torchvision",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "zmq",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ],
+  "ollama": [
+    "aiohttp",
+    "aiosqlite",
+    "blobfile",
+    "chardet",
+    "chromadb-client",
+    "faiss-cpu",
+    "fastapi",
+    "fire",
+    "httpx",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "ollama",
+    "pandas",
+    "pillow",
+    "psycopg2-binary",
+    "pypdf",
+    "redis",
+    "scikit-learn",
+    "scipy",
+    "sentencepiece",
+    "tqdm",
+    "transformers",
+    "uvicorn",
+    "sentence-transformers --no-deps",
+    "torch --index-url https://download.pytorch.org/whl/cpu"
+  ]
+}
--- a/docs/resources/llama-stack-spec.html
+++ b/docs/resources/llama-stack-spec.html
@ -21,7 +21,7 @@
    "info": {
        "title": "[DRAFT] Llama Stack Specification",
        "version": "alpha",
-        "description": "This is the specification of the llama stack that provides\n                a set of endpoints and their corresponding interfaces that are tailored to\n                best leverage Llama Models. The specification is still in draft and subject to change.\n                Generated at 2024-11-18 23:37:24.867143"
+        "description": "This is the specification of the llama stack that provides\n                a set of endpoints and their corresponding interfaces that are tailored to\n                best leverage Llama Models. The specification is still in draft and subject to change.\n                Generated at 2024-11-19 09:14:01.145131"
    },
    "servers": [
        {
--- a/docs/resources/llama-stack-spec.yaml
+++ b/docs/resources/llama-stack-spec.yaml
@ -3400,7 +3400,7 @@ info:
  description: "This is the specification of the llama stack that provides\n     \
    \           a set of endpoints and their corresponding interfaces that are tailored\
    \ to\n                best leverage Llama Models. The specification is still in\
-    \ draft and subject to change.\n                Generated at 2024-11-18 23:37:24.867143"
+    \ draft and subject to change.\n                Generated at 2024-11-19 09:14:01.145131"
  title: '[DRAFT] Llama Stack Specification'
  version: alpha
 jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
--- a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md
+++ b/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md
@ -25,8 +25,8 @@ The following models are available by default:
 - `meta-llama/Llama-3.1-8B-Instruct (fireworks/llama-v3p1-8b-instruct)`
 - `meta-llama/Llama-3.1-70B-Instruct (fireworks/llama-v3p1-70b-instruct)`
 - `meta-llama/Llama-3.1-405B-Instruct-FP8 (fireworks/llama-v3p1-405b-instruct)`
- `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-1b-instruct)`
- `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-3b-instruct)`
+- `meta-llama/Llama-3.2-1B-Instruct (fireworks/llama-v3p2-1b-instruct)`
+- `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-3b-instruct)`
 - `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-11b-vision-instruct)`
 - `meta-llama/Llama-3.2-90B-Vision-Instruct (fireworks/llama-v3p2-90b-vision-instruct)`
 - `meta-llama/Llama-Guard-3-8B (fireworks/llama-guard-3-8b)`
--- a/docs/source/getting_started/distributions/self_hosted_distro/index.md
+++ b/docs/source/getting_started/distributions/self_hosted_distro/index.md
@ -23,5 +23,6 @@ tgi
 dell-tgi
 together
 fireworks
+remote-vllm
 bedrock
 ```
--- a/docs/source/getting_started/index.md
+++ b/docs/source/getting_started/index.md
@ -53,9 +53,9 @@ Please see our pages in detail for the types of distributions we offer:
 3. [On-device Distribution](./distributions/ondevice_distro/index.md): If you want to run Llama Stack inference on your iOS / Android device.


-### Quick Start Commands
+### Table of Contents

-Once you have decided on the inference provider and distribution to use, use the following quick start commands to get started.
+Once you have decided on the inference provider and distribution to use, use the following guides to get started.

 ##### 1.0 Prerequisite

@ -109,421 +109,33 @@ Access to Single-Node CPU with Fireworks hosted endpoint via API_KEY from [firew

 ##### 1.1. Start the distribution

-**(Option 1) Via Docker**
 ::::{tab-set}
-
 :::{tab-item} meta-reference-gpu
-```
-$ cd llama-stack/distributions/meta-reference-gpu && docker compose up
-```
-
-This will download and start running a pre-built Docker container. Alternatively, you may use the following commands:
-
-```
-docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-gpu --yaml_config /root/my-run.yaml
-```
+- [Start Meta Reference GPU Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html)
 :::

 :::{tab-item} vLLM
-```
-$ cd llama-stack/distributions/remote-vllm && docker compose up
-```
-
-The script will first start up vLLM server on port 8000, then start up Llama Stack distribution server hooking up to it for inference. You should see the following outputs --
-```
-<TO BE FILLED>
-```
-
-To kill the server
-```
-docker compose down
-```
+- [Start vLLM Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/remote-vllm.html)
 :::

 :::{tab-item} tgi
-```
-$ cd llama-stack/distributions/tgi && docker compose up
-```
-
-The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference. You should see the following outputs --
-```
-[text-generation-inference] | 2024-10-15T18:56:33.810397Z  INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama)
-[text-generation-inference] | 2024-10-15T18:56:33.810448Z  WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0
-[text-generation-inference] | 2024-10-15T18:56:33.864143Z  INFO text_generation_router::server: router/src/server.rs:2353: Connected
-INFO:     Started server process [1]
-INFO:     Waiting for application startup.
-INFO:     Application startup complete.
-INFO:     Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
-```
-
-To kill the server
-```
-docker compose down
-```
-:::
-
-
-:::{tab-item} ollama
-```
-$ cd llama-stack/distributions/ollama && docker compose up
-
-# OR
-
-$ cd llama-stack/distributions/ollama-gpu && docker compose up
-```
-
-You will see outputs similar to following ---
-```
-[ollama]               | [GIN] 2024/10/18 - 21:19:41 | 200 |     226.841µs |             ::1 | GET      "/api/ps"
-[ollama]               | [GIN] 2024/10/18 - 21:19:42 | 200 |      60.908µs |             ::1 | GET      "/api/ps"
-INFO:     Started server process [1]
-INFO:     Waiting for application startup.
-INFO:     Application startup complete.
-INFO:     Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
-[llamastack] | Resolved 12 providers
-[llamastack] |  inner-inference => ollama0
-[llamastack] |  models => __routing_table__
-[llamastack] |  inference => __autorouted__
-```
-
-To kill the server
-```
-docker compose down
-```
-:::
-
-:::{tab-item} fireworks
-```
-$ cd llama-stack/distributions/fireworks && docker compose up
-```
-
-Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks URL server endpoint. E.g.
-```
-inference:
-  - provider_id: fireworks
-    provider_type: remote::fireworks
-    config:
-      url: https://api.fireworks.ai/inference
-      api_key: <optional api key>
-```
-:::
-
-:::{tab-item} together
-```
-$ cd distributions/together && docker compose up
-```
-
-Make sure your `run.yaml` file has the inference provider pointing to the correct Together URL server endpoint. E.g.
-```
-inference:
-  - provider_id: together
-    provider_type: remote::together
-    config:
-      url: https://api.together.xyz/v1
-      api_key: <optional api key>
-```
-:::
-
-
-::::
-
-**(Option 2) Via Conda**
-
-::::{tab-set}
-
-:::{tab-item} meta-reference-gpu
-1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html)
-
-2. Build the `meta-reference-gpu` distribution
-
-```
-$ llama stack build --template meta-reference-gpu --image-type conda
-```
-
-3. Start running distribution
-```
-$ llama stack run ~/.llama/distributions/llamastack-meta-reference-gpu/meta-reference-gpu-run.yaml
-```
-
-Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section:
-```
-memory:
-  - provider_id: faiss-0
-    provider_type: faiss
-    config:
-      kvstore:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/faiss_store.db
-```
-
-:::
-
-:::{tab-item} tgi
-1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html)
-
-2. Build the `tgi` distribution
-
-```bash
-llama stack build --template tgi --image-type conda
-```
-
-3. Start a TGI server endpoint
-
-4. Make sure in your `run.yaml` file, your `conda_env` is pointing to the conda environment and inference provider is pointing to the correct TGI server endpoint. E.g.
-```
-conda_env: llamastack-tgi
-...
-inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url: http://127.0.0.1:5009
-```
-
-5. Start Llama Stack server
-```bash
-$ llama stack run ~/.llama/distributions/llamastack-tgi/tgi-run.yaml
-```
-
-Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section:
-```
-memory:
-  - provider_id: faiss-0
-    provider_type: faiss
-    config:
-      kvstore:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/faiss_store.db
-```
+- [Start TGI Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html)
 :::

 :::{tab-item} ollama
-
-If you wish to separately spin up a Ollama server, and connect with Llama Stack, you may use the following commands.
-
-#### Start Ollama server.
- Please check the [Ollama Documentations](https://github.com/ollama/ollama) for more details.
-
-**Via Docker**
-```
-docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
-```
-
-**Via CLI**
-```
-ollama run <model_id>
-```
-
-#### Start Llama Stack server pointing to Ollama server
-
-Make sure your `run.yaml` file has the inference provider pointing to the correct Ollama endpoint. E.g.
-```
-conda_env: llamastack-ollama
-...
-inference:
-  - provider_id: ollama0
-    provider_type: remote::ollama
-    config:
-      url: http://127.0.0.1:11434
-```
-
-```
-llama stack build --template ollama --image-type conda
-llama stack run ~/.llama/distributions/llamastack-ollama/ollama-run.yaml
-```
-
-Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section:
-```
-memory:
-  - provider_id: faiss-0
-    provider_type: faiss
-    config:
-      kvstore:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/faiss_store.db
-```
-
-:::
-
-:::{tab-item} fireworks
-
-```bash
-llama stack build --template fireworks --image-type conda
-# -- modify run.yaml to a valid Fireworks server endpoint
-llama stack run ./run.yaml
-```
-
-Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks URL server endpoint. E.g.
-```
-conda_env: llamastack-fireworks
-...
-inference:
-  - provider_id: fireworks
-    provider_type: remote::fireworks
-    config:
-      url: https://api.fireworks.ai/inference
-      api_key: <optional api key>
-```
+- [Start Ollama Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html)
 :::

 :::{tab-item} together
-
-```bash
-llama stack build --template together --image-type conda
-# -- modify run.yaml to a valid Together server endpoint
-llama stack run ~/.llama/distributions/llamastack-together/together-run.yaml
-```
-
-Make sure your `run.yaml` file has the inference provider pointing to the correct Together URL server endpoint. E.g.
-```
-conda_env: llamastack-together
-...
-inference:
-  - provider_id: together
-    provider_type: remote::together
-    config:
-      url: https://api.together.xyz/v1
-      api_key: <optional api key>
-```
-:::
-
-::::
-
-##### 1.2 (Optional) Update Model Serving Configuration
-::::{tab-set}
-
-:::{tab-item} meta-reference-gpu
-You may change the `config.model` in `run.yaml` to update the model currently being served by the distribution. Make sure you have the model checkpoint downloaded in your `~/.llama`.
-```
-inference:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      model: Llama3.2-11B-Vision-Instruct
-      quantization: null
-      torch_seed: null
-      max_seq_len: 4096
-      max_batch_size: 1
-```
-
-Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
-:::
-
-:::{tab-item} tgi
-To serve a new model with `tgi`, change the docker command flag `--model-id <model-to-serve>`.
-
-This can be done by edit the `command` args in `compose.yaml`. E.g. Replace "Llama-3.2-1B-Instruct" with the model you want to serve.
-
-```
-command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.2-1B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
-```
-
-or by changing the docker run command's `--model-id` flag
-```
-docker run --rm -it -v $HOME/.cache/huggingface:/data -p 5009:5009 --gpus all ghcr.io/huggingface/text-generation-inference:latest --dtype bfloat16 --usage-stats on --sharded false --model-id meta-llama/Llama-3.2-1B-Instruct --port 5009
-```
-
-Make sure your `run.yaml` file has the inference provider pointing to the TGI server endpoint serving your model.
-```
-inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url: http://127.0.0.1:5009
-```
-```
-
-Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints.
-:::
-
-:::{tab-item} ollama
-You can use ollama for managing model downloads.
-
-```
-ollama pull llama3.1:8b-instruct-fp16
-ollama pull llama3.1:70b-instruct-fp16
-```
-
-> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models.
-
-
-To serve a new model with `ollama`
-```
-ollama run <model_name>
-```
-
-To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama.
-```
-$ ollama ps
-
-NAME                         ID              SIZE     PROCESSOR    UNTIL
-llama3.1:8b-instruct-fp16    4aacac419454    17 GB    100% GPU     4 minutes from now
-```
-
-To verify that the model served by ollama is correctly connected to Llama Stack server
-```
-$ llama-stack-client models list
-+----------------------+----------------------+---------------+-----------------------------------------------+
-| identifier           | llama_model          | provider_id   | metadata                                      |
-+======================+======================+===============+===============================================+
-| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | ollama0       | {'ollama_model': 'llama3.1:8b-instruct-fp16'} |
-+----------------------+----------------------+---------------+-----------------------------------------------+
-```
-:::
-
-:::{tab-item} together
-Use `llama-stack-client models list` to check the available models served by together.
-
-```
-$ llama-stack-client models list
-+------------------------------+------------------------------+---------------+------------+
-| identifier                   | llama_model                  | provider_id   | metadata   |
-+==============================+==============================+===============+============+
-| Llama3.1-8B-Instruct         | Llama3.1-8B-Instruct         | together0     | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-70B-Instruct        | Llama3.1-70B-Instruct        | together0     | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-405B-Instruct       | Llama3.1-405B-Instruct       | together0     | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-3B-Instruct         | Llama3.2-3B-Instruct         | together0     | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0     | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0     | {}         |
-+------------------------------+------------------------------+---------------+------------+
-```
+- [Start Together Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html)
 :::

 :::{tab-item} fireworks
-Use `llama-stack-client models list` to check the available models served by Fireworks.
-```
-$ llama-stack-client models list
-+------------------------------+------------------------------+---------------+------------+
-| identifier                   | llama_model                  | provider_id   | metadata   |
-+==============================+==============================+===============+============+
-| Llama3.1-8B-Instruct         | Llama3.1-8B-Instruct         | fireworks0    | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-70B-Instruct        | Llama3.1-70B-Instruct        | fireworks0    | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.1-405B-Instruct       | Llama3.1-405B-Instruct       | fireworks0    | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-1B-Instruct         | Llama3.2-1B-Instruct         | fireworks0    | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-3B-Instruct         | Llama3.2-3B-Instruct         | fireworks0    | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | fireworks0    | {}         |
-+------------------------------+------------------------------+---------------+------------+
-| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | fireworks0    | {}         |
-+------------------------------+------------------------------+---------------+------------+
-```
+- [Start Fireworks Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html)
 :::

 ::::

-
 ##### Troubleshooting
 - If you encounter any issues, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file an new issue.
 - Use `--port <PORT>` flag to use a different port number. For docker run, update the `-p <PORT>:<PORT>` flag.
@ -535,10 +147,10 @@ $ llama-stack-client models list
 Once the server is set up, we can test it with a client to verify it's working correctly. The following command will send a chat completion request to the server's `/inference/chat_completion` API:

 ```bash
-$ curl http://localhost:5000/inference/chat_completion \
+$ curl http://localhost:5000/alpha/inference/chat-completion \
 -H "Content-Type: application/json" \
 -d '{
-    "model_id": "Llama3.1-8B-Instruct",
+    "model_id": "meta-llama/Llama-3.1-8B-Instruct",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write me a 2 sentence poem about the moon"}
--- a/docs/zero_to_hero_guide/quickstart.md
+++ b/docs/zero_to_hero_guide/quickstart.md
@ -22,14 +22,22 @@ If you're looking for more specific topics like tool calling or agent setup, we
   - Download and unzip `Ollama-darwin.zip`.
   - Run the `Ollama` application.

-2. **Download the Ollama CLI**:
+1. **Download the Ollama CLI**:
   - Ensure you have the `ollama` command line tool by downloading and installing it from the same website.

-3. **Verify Installation**:
+1. **Start ollama server**:
+   - Open the terminal and run:
+      ```
+      ollama serve
+      ```
+
+1. **Run the model**:
   - Open the terminal and run:
     ```bash
-     ollama run llama3.2:1b
+     ollama run llama3.2:3b-instruct-fp16
     ```
+     **Note**: The models currently supported by Llama Stack are listed [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L43).
+

 ---

@ -84,6 +92,8 @@ If you're looking for more specific topics like tool calling or agent setup, we
     ```bash
     llama stack run /path/to/your/distro/llamastack-ollama/ollama-run.yaml --port 5050
     ```
+     Note:
+        1. Every time you run a new model with `ollama run`, you will need to restart the Llama Stack server; otherwise it won't see the new model.

 The server will start and listen on `http://localhost:5050`.

@ -97,7 +107,7 @@ After setting up the server, open a new terminal window and verify it's working
 curl http://localhost:5050/inference/chat_completion \
 -H "Content-Type: application/json" \
 -d '{
-    "model": "llama3.2:1b",
+    "model": "Llama3.2-3B-Instruct",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write me a 2-sentence poem about the moon"}
@ -106,6 +116,8 @@ curl http://localhost:5050/inference/chat_completion \
 }'
 ```

+You can check the available models with the command `llama-stack-client models list`.
+
 **Expected Output:**
 ```json
 {
--- a/llama_stack/apis/agents/agents.py
+++ b/llama_stack/apis/agents/agents.py
@ -54,6 +54,7 @@ class ToolDefinitionCommon(BaseModel):
 class SearchEngineType(Enum):
    bing = "bing"
    brave = "brave"
+    tavily = "tavily"


@json_schema_type
--- a/llama_stack/cli/download.py
+++ b/llama_stack/cli/download.py
@ -380,6 +380,7 @@ def _hf_download(

 def _meta_download(
    model: "Model",
+    model_id: str,
    meta_url: str,
    info: "LlamaDownloadInfo",
    max_concurrent_downloads: int,
@ -405,8 +406,15 @@ def _meta_download(
    downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads)
    asyncio.run(downloader.download_all(tasks))

-    print(f"\nSuccessfully downloaded model to {output_dir}")
-    cprint(f"\nMD5 Checksums are at: {output_dir / 'checklist.chk'}", "white")
+    cprint(f"\nSuccessfully downloaded model to {output_dir}", "green")
+    cprint(
+        f"\nView MD5 checksum files at: {output_dir / 'checklist.chk'}",
+        "white",
+    )
+    cprint(
+        f"\n[Optionally] To run MD5 checksums, use the following command: llama model verify-download --model-id {model_id}",
+        "yellow",
+    )


 class ModelEntry(BaseModel):
@ -512,7 +520,7 @@ def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser):
                )
                if "llamameta.net" not in meta_url:
                    parser.error("Invalid Meta URL provided")
-                _meta_download(model, meta_url, info, args.max_parallel)
+                _meta_download(model, model_id, meta_url, info, args.max_parallel)

    except Exception as e:
        parser.error(f"Download failed: {str(e)}")
--- a/llama_stack/distribution/build_container.sh
+++ b/llama_stack/distribution/build_container.sh
@ -9,6 +9,7 @@
 LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-}
 LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-}
 TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-}
+BUILD_PLATFORM=${BUILD_PLATFORM:-}

 if [ "$#" -lt 4 ]; then
  echo "Usage: $0 <build_name> <docker_base> <pip_dependencies> [<special_pip_deps>]" >&2
@ -96,7 +97,7 @@ else
    add_to_docker "RUN pip install fastapi libcst"
    add_to_docker <<EOF
 RUN pip install --no-cache --extra-index-url https://test.pypi.org/simple/ \
-  llama-models==$TEST_PYPI_VERSION llama-stack==$TEST_PYPI_VERSION
+  llama-models==$TEST_PYPI_VERSION llama-stack-client==$TEST_PYPI_VERSION llama-stack==$TEST_PYPI_VERSION
 EOF
  else
    add_to_docker "RUN pip install --no-cache llama-stack"
@ -116,7 +117,6 @@ RUN pip install --no-cache $models_mount
 EOF
 fi

-
 add_to_docker <<EOF

 # This would be good in production but for debugging flexibility lets not add it right now
@ -158,7 +158,9 @@ image_tag="$image_name:$version_tag"

 # Detect platform architecture
 ARCH=$(uname -m)
-if [ "$ARCH" = "arm64" ] || [ "$ARCH" = "aarch64" ]; then
+if [ -n "$BUILD_PLATFORM" ]; then
+  PLATFORM="--platform $BUILD_PLATFORM"
+elif [ "$ARCH" = "arm64" ] || [ "$ARCH" = "aarch64" ]; then
  PLATFORM="--platform linux/arm64"
 elif [ "$ARCH" = "x86_64" ]; then
  PLATFORM="--platform linux/amd64"
--- a/llama_stack/distribution/client.py
+++ b/llama_stack/distribution/client.py
@ -15,6 +15,8 @@ import httpx
 from pydantic import BaseModel, parse_obj_as
 from termcolor import cprint

+from llama_stack.apis.version import LLAMA_STACK_API_VERSION
+
 from llama_stack.providers.datatypes import RemoteProviderConfig

 _CLIENT_CLASSES = {}
@ -117,7 +119,7 @@ def create_api_client_class(protocol) -> Type:
                    break
                kwargs[param.name] = args[i]

-            url = f"{self.base_url}{webmethod.route}"
+            url = f"{self.base_url}/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}"

            def convert(value):
                if isinstance(value, list):
--- a/llama_stack/distribution/server/endpoints.py
+++ b/llama_stack/distribution/server/endpoints.py
@ -9,6 +9,8 @@ from typing import Dict, List

 from pydantic import BaseModel

+from llama_stack.apis.version import LLAMA_STACK_API_VERSION
+
 from llama_stack.distribution.resolver import api_protocol_map

 from llama_stack.providers.datatypes import Api
@ -33,7 +35,7 @@ def get_all_api_endpoints() -> Dict[Api, List[ApiEndpoint]]:
                continue

            webmethod = method.__webmethod__
-            route = webmethod.route
+            route = f"/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}"

            if webmethod.method == "GET":
                method = "get"
--- a/llama_stack/providers/inline/agents/meta_reference/tools/builtin.py
+++ b/llama_stack/providers/inline/agents/meta_reference/tools/builtin.py
@ -86,10 +86,13 @@ class PhotogenTool(SingleMessageBuiltinTool):
 class SearchTool(SingleMessageBuiltinTool):
    def __init__(self, engine: SearchEngineType, api_key: str, **kwargs) -> None:
        self.api_key = api_key
+        self.engine_type = engine
        if engine == SearchEngineType.bing:
            self.engine = BingSearch(api_key, **kwargs)
        elif engine == SearchEngineType.brave:
            self.engine = BraveSearch(api_key, **kwargs)
+        elif engine == SearchEngineType.tavily:
+            self.engine = TavilySearch(api_key, **kwargs)
        else:
            raise ValueError(f"Unknown search engine: {engine}")

@ -257,6 +260,31 @@ class BraveSearch:
        return {"query": query, "top_k": clean_response}


+class TavilySearch:
+    """Search engine backend that sends queries to the Tavily search API."""
+
+    def __init__(self, api_key: str) -> None:
+        self.api_key = api_key
+
+    async def search(self, query: str) -> str:
+        """POST `query` to Tavily and return the cleaned response as a JSON string."""
+        # NOTE(review): this is a synchronous `requests.post` with no timeout,
+        # called from a coroutine; left unchanged here, but worth revisiting.
+        response = requests.post(
+            "https://api.tavily.com/search",
+            json={"api_key": self.api_key, "query": query},
+        )
+        return json.dumps(self._clean_tavily_response(response.json()))
+
+    def _clean_tavily_response(self, search_response, top_k=3):
+        """Return the response's "query" and at most `top_k` of its "results"."""
+        # `top_k` used to be accepted but ignored; it now caps the result list.
+        return {
+            "query": search_response["query"],
+            "top_k": search_response["results"][:top_k],
+        }
+
+
 class WolframAlphaTool(SingleMessageBuiltinTool):
    def __init__(self, api_key: str) -> None:
        self.api_key = api_key
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@ -50,11 +50,11 @@ MODEL_ALIASES = [
    ),
    build_model_alias(
        "fireworks/llama-v3p2-1b-instruct",
-        CoreModelId.llama3_2_3b_instruct.value,
+        CoreModelId.llama3_2_1b_instruct.value,
    ),
    build_model_alias(
        "fireworks/llama-v3p2-3b-instruct",
-        CoreModelId.llama3_2_11b_vision_instruct.value,
+        CoreModelId.llama3_2_3b_instruct.value,
    ),
    build_model_alias(
        "fireworks/llama-v3p2-11b-vision-instruct",
@ -214,10 +214,10 @@ class FireworksInferenceAdapter(

        async def _to_async_generator():
            if "messages" in params:
-                stream = await self._get_client().chat.completions.acreate(**params)
+                stream = self._get_client().chat.completions.acreate(**params)
            else:
-                stream = self._get_client().completion.create(**params)
-            for chunk in stream:
+                stream = self._get_client().completion.acreate(**params)
+            async for chunk in stream:
                yield chunk

        stream = _to_async_generator()
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@ -264,6 +264,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):

 class TGIAdapter(_HfAdapter):
    async def initialize(self, config: TGIImplConfig) -> None:
+        print(f"Initializing TGI client with url={config.url}")
        self.client = AsyncInferenceClient(model=config.url, token=config.api_token)
        endpoint_info = await self.client.get_endpoint_info()
        self.max_tokens = endpoint_info["max_total_tokens"]
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@ -53,6 +53,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
        self.client = None

    async def initialize(self) -> None:
+        print(f"Initializing VLLM client with base_url={self.config.url}")
        self.client = OpenAI(base_url=self.config.url, api_key=self.config.api_token)

    async def shutdown(self) -> None:
--- a/llama_stack/providers/tests/agents/test_agents.py
+++ b/llama_stack/providers/tests/agents/test_agents.py
@ -68,6 +68,73 @@ def query_attachment_messages():
    ]


+async def create_agent_turn_with_search_tool(
+    agents_stack: Dict[str, object],
+    search_query_messages: List[object],
+    common_params: Dict[str, str],
+    search_tool_definition: SearchToolDefinition,
+) -> None:
+    """
+    Run one streamed agent turn with a search tool and assert a tool call ran.
+
+    Args:
+        agents_stack: Stack object; used as `agents_stack.impls[Api.agents]`.
+        search_query_messages: Messages sent as the input of the turn.
+        common_params: Base keyword arguments expanded into `AgentConfig`.
+        search_tool_definition: The only entry of the agent's `tools` list.
+    """
+
+    # Build the agent config: shared params plus the search tool as its only tool
+    agent_config = AgentConfig(
+        **{
+            **common_params,
+            "tools": [search_tool_definition],
+        }
+    )
+
+    agent_id, session_id = await create_agent_session(
+        agents_stack.impls[Api.agents], agent_config
+    )
+    turn_request = dict(
+        agent_id=agent_id,
+        session_id=session_id,
+        messages=search_query_messages,
+        stream=True,
+    )
+
+    turn_response = [
+        chunk
+        async for chunk in await agents_stack.impls[Api.agents].create_agent_turn(
+            **turn_request
+        )
+    ]
+
+    assert len(turn_response) > 0
+    assert all(
+        isinstance(chunk, AgentTurnResponseStreamChunk) for chunk in turn_response
+    )
+
+    check_event_types(turn_response)
+
+    # Keep only step-complete chunks whose step type is a tool execution
+    tool_execution_events = [
+        chunk
+        for chunk in turn_response
+        if isinstance(chunk.event.payload, AgentTurnResponseStepCompletePayload)
+        and chunk.event.payload.step_details.step_type == StepType.tool_execution.value
+    ]
+    assert len(tool_execution_events) > 0, "No tool execution events found"
+
+    # Inspect the first tool execution; brave_search is expected for any engine
+    tool_execution = tool_execution_events[0].event.payload.step_details
+    assert isinstance(tool_execution, ToolExecutionStep)
+    assert len(tool_execution.tool_calls) > 0
+    assert tool_execution.tool_calls[0].tool_name == BuiltinTool.brave_search
+    assert len(tool_execution.tool_responses) > 0
+
+    check_turn_complete_event(turn_response, session_id, search_query_messages)
+
+
 class TestAgents:
    @pytest.mark.asyncio
    async def test_agent_turns_with_safety(
@ -215,63 +282,34 @@ class TestAgents:
    async def test_create_agent_turn_with_brave_search(
        self, agents_stack, search_query_messages, common_params
    ):
-        agents_impl = agents_stack.impls[Api.agents]
-
        if "BRAVE_SEARCH_API_KEY" not in os.environ:
            pytest.skip("BRAVE_SEARCH_API_KEY not set, skipping test")

-        # Create an agent with Brave search tool
-        agent_config = AgentConfig(
-            **{
-                **common_params,
-                "tools": [
-                    SearchToolDefinition(
-                        type=AgentTool.brave_search.value,
-                        api_key=os.environ["BRAVE_SEARCH_API_KEY"],
-                        engine=SearchEngineType.brave,
-                    )
-                ],
-            }
+        search_tool_definition = SearchToolDefinition(
+            type=AgentTool.brave_search.value,
+            api_key=os.environ["BRAVE_SEARCH_API_KEY"],
+            engine=SearchEngineType.brave,
+        )
+        await create_agent_turn_with_search_tool(
+            agents_stack, search_query_messages, common_params, search_tool_definition
        )

-        agent_id, session_id = await create_agent_session(agents_impl, agent_config)
-        turn_request = dict(
-            agent_id=agent_id,
-            session_id=session_id,
-            messages=search_query_messages,
-            stream=True,
+    @pytest.mark.asyncio
+    async def test_create_agent_turn_with_tavily_search(
+        self, agents_stack, search_query_messages, common_params
+    ):
+        if "TAVILY_SEARCH_API_KEY" not in os.environ:
+            pytest.skip("TAVILY_SEARCH_API_KEY not set, skipping test")
+
+        search_tool_definition = SearchToolDefinition(
+            type=AgentTool.brave_search.value,  # place holder only
+            api_key=os.environ["TAVILY_SEARCH_API_KEY"],
+            engine=SearchEngineType.tavily,
        )
-
-        turn_response = [
-            chunk async for chunk in await agents_impl.create_agent_turn(**turn_request)
-        ]
-
-        assert len(turn_response) > 0
-        assert all(
-            isinstance(chunk, AgentTurnResponseStreamChunk) for chunk in turn_response
+        await create_agent_turn_with_search_tool(
+            agents_stack, search_query_messages, common_params, search_tool_definition
        )

-        check_event_types(turn_response)
-
-        # Check for tool execution events
-        tool_execution_events = [
-            chunk
-            for chunk in turn_response
-            if isinstance(chunk.event.payload, AgentTurnResponseStepCompletePayload)
-            and chunk.event.payload.step_details.step_type
-            == StepType.tool_execution.value
-        ]
-        assert len(tool_execution_events) > 0, "No tool execution events found"
-
-        # Check the tool execution details
-        tool_execution = tool_execution_events[0].event.payload.step_details
-        assert isinstance(tool_execution, ToolExecutionStep)
-        assert len(tool_execution.tool_calls) > 0
-        assert tool_execution.tool_calls[0].tool_name == BuiltinTool.brave_search
-        assert len(tool_execution.tool_responses) > 0
-
-        check_turn_complete_event(turn_response, session_id, search_query_messages)
-

 def check_event_types(turn_response):
    event_types = [chunk.event.payload.event_type for chunk in turn_response]
--- a/llama_stack/providers/tests/inference/test_text_inference.py
+++ b/llama_stack/providers/tests/inference/test_text_inference.py
@ -25,7 +25,11 @@ from .utils import group_chunks


 def get_expected_stop_reason(model: str):
-    return StopReason.end_of_message if "Llama3.1" in model else StopReason.end_of_turn
+    return (
+        StopReason.end_of_message
+        if ("Llama3.1" in model or "Llama-3.1" in model)
+        else StopReason.end_of_turn
+    )


@pytest.fixture
@ -34,7 +38,7 @@ def common_params(inference_model):
        "tool_choice": ToolChoice.auto,
        "tool_prompt_format": (
            ToolPromptFormat.json
-            if "Llama3.1" in inference_model
+            if ("Llama3.1" in inference_model or "Llama-3.1" in inference_model)
            else ToolPromptFormat.python_list
        ),
    }
--- a/llama_stack/scripts/distro_codegen.py
+++ b/llama_stack/scripts/distro_codegen.py
@ -6,6 +6,7 @@

 import concurrent.futures
 import importlib
+import json
 import subprocess
 import sys
 from functools import partial
@ -14,6 +15,11 @@ from typing import Iterator

 from rich.progress import Progress, SpinnerColumn, TextColumn

+from llama_stack.distribution.build import (
+    get_provider_dependencies,
+    SERVER_DEPENDENCIES,
+)
+

 REPO_ROOT = Path(__file__).parent.parent.parent

@ -67,6 +73,39 @@ def check_for_changes() -> bool:
    return result.returncode != 0


+def collect_template_dependencies(template_dir: Path) -> "tuple[str | None, list[str]]":
+    """Return (template name, dependency list), or (None, []) if unavailable."""
+    try:
+        module_name = f"llama_stack.templates.{template_dir.name}"
+        module = importlib.import_module(module_name)
+
+        if template_func := getattr(module, "get_distribution_template", None):
+            template = template_func()
+            normal_deps, special_deps = get_provider_dependencies(template.providers)
+            # Sorted, de-duplicated normal + server deps first, then special deps
+            all_deps = sorted(set(normal_deps + SERVER_DEPENDENCIES))
+            all_deps += sorted(set(special_deps))
+            return template.name, all_deps
+    except Exception:
+        # Best-effort: any failure while loading the template yields no entry
+        return None, []
+    return None, []
+
+
+def generate_dependencies_file():
+    """Write every template's dependency list to distributions/dependencies.json."""
+    templates_root = REPO_ROOT / "llama_stack" / "templates"
+    collected = map(collect_template_dependencies, find_template_dirs(templates_root))
+
+    # Entries that come back with a falsy name are left out of the mapping
+    deps_by_name = {name: deps for name, deps in collected if name}
+
+    output_path = REPO_ROOT / "distributions" / "dependencies.json"
+    # Mode "w" replaces any previously generated file
+    with open(output_path, "w") as out:
+        json.dump(deps_by_name, out, indent=2)
+
+
 def main():
    templates_dir = REPO_ROOT / "llama_stack" / "templates"

@ -88,6 +127,8 @@ def main():
            list(executor.map(process_func, template_dirs))
            progress.update(task, advance=len(template_dirs))

+    generate_dependencies_file()
+
    if check_for_changes():
        print(
            "Distribution template changes detected. Please commit the changes.",
--- a/llama_stack/templates/fireworks/run.yaml
+++ b/llama_stack/templates/fireworks/run.yaml
@ -57,11 +57,11 @@ models:
  provider_id: null
  provider_model_id: fireworks/llama-v3p1-405b-instruct
 - metadata: {}
-  model_id: meta-llama/Llama-3.2-3B-Instruct
+  model_id: meta-llama/Llama-3.2-1B-Instruct
  provider_id: null
  provider_model_id: fireworks/llama-v3p2-1b-instruct
 - metadata: {}
-  model_id: meta-llama/Llama-3.2-11B-Vision-Instruct
+  model_id: meta-llama/Llama-3.2-3B-Instruct
  provider_id: null
  provider_model_id: fireworks/llama-v3p2-3b-instruct
 - metadata: {}
--- a/llama_stack/templates/tgi/build.yaml
+++ b/llama_stack/templates/tgi/build.yaml
@ -2,7 +2,7 @@ version: '2'
 name: tgi
 distribution_spec:
  description: Use (an external) TGI server for running LLM inference
-  docker_image: llamastack/distribution-tgi:test-0.0.52rc3
+  docker_image: null
  providers:
    inference:
    - remote::tgi
--- a/llama_stack/templates/tgi/run-with-safety.yaml
+++ b/llama_stack/templates/tgi/run-with-safety.yaml
@ -1,6 +1,6 @@
 version: '2'
 image_name: tgi
-docker_image: llamastack/distribution-tgi:test-0.0.52rc3
+docker_image: null
 conda_env: tgi
 apis:
 - agents
--- a/llama_stack/templates/tgi/run.yaml
+++ b/llama_stack/templates/tgi/run.yaml
@ -1,6 +1,6 @@
 version: '2'
 image_name: tgi
-docker_image: llamastack/distribution-tgi:test-0.0.52rc3
+docker_image: null
 conda_env: tgi
 apis:
 - agents
--- a/llama_stack/templates/tgi/tgi.py
+++ b/llama_stack/templates/tgi/tgi.py
@ -41,7 +41,7 @@ def get_distribution_template() -> DistributionTemplate:
        name="tgi",
        distro_type="self_hosted",
        description="Use (an external) TGI server for running LLM inference",
-        docker_image="llamastack/distribution-tgi:test-0.0.52rc3",
+        docker_image=None,
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
        default_models=[inference_model, safety_model],
--- a/requirements.txt
+++ b/requirements.txt
@ -2,7 +2,8 @@ blobfile
 fire
 httpx
 huggingface-hub
-llama-models>=0.0.50
+llama-models>=0.0.53
+llama-stack-client>=0.0.53
 prompt-toolkit
 python-dotenv
 pydantic>=2
--- a/setup.py
+++ b/setup.py
@ -16,7 +16,7 @@ def read_requirements():

 setup(
    name="llama_stack",
-    version="0.0.50",
+    version="0.0.53",
    author="Meta Llama",
    author_email="llama-oss@meta.com",
    description="Llama Stack",