From 5e4ac1b7c1feee0e770a4149bafa9c6bb7ac812f Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 09:15:05 -0800 Subject: [PATCH 01/22] Make sure server code uses version prefixed routes --- docs/resources/llama-stack-spec.html | 2 +- docs/resources/llama-stack-spec.yaml | 2 +- llama_stack/distribution/server/endpoints.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html index 838633a4f..cf4bf5125 100644 --- a/docs/resources/llama-stack-spec.html +++ b/docs/resources/llama-stack-spec.html @@ -21,7 +21,7 @@ "info": { "title": "[DRAFT] Llama Stack Specification", "version": "alpha", - "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-11-18 23:37:24.867143" + "description": "This is the specification of the llama stack that provides\n a set of endpoints and their corresponding interfaces that are tailored to\n best leverage Llama Models. The specification is still in draft and subject to change.\n Generated at 2024-11-19 09:14:01.145131" }, "servers": [ { diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml index 994e3aac4..e84f11bdd 100644 --- a/docs/resources/llama-stack-spec.yaml +++ b/docs/resources/llama-stack-spec.yaml @@ -3400,7 +3400,7 @@ info: description: "This is the specification of the llama stack that provides\n \ \ a set of endpoints and their corresponding interfaces that are tailored\ \ to\n best leverage Llama Models. The specification is still in\ - \ draft and subject to change.\n Generated at 2024-11-18 23:37:24.867143" + \ draft and subject to change.\n Generated at 2024-11-19 09:14:01.145131" title: '[DRAFT] Llama Stack Specification' version: alpha jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema diff --git a/llama_stack/distribution/server/endpoints.py b/llama_stack/distribution/server/endpoints.py index 93432abe1..af429e020 100644 --- a/llama_stack/distribution/server/endpoints.py +++ b/llama_stack/distribution/server/endpoints.py @@ -9,6 +9,8 @@ from typing import Dict, List from pydantic import BaseModel +from llama_stack.apis.version import LLAMA_STACK_API_VERSION + from llama_stack.distribution.resolver import api_protocol_map from llama_stack.providers.datatypes import Api @@ -33,7 +35,7 @@ def get_all_api_endpoints() -> Dict[Api, List[ApiEndpoint]]: continue webmethod = method.__webmethod__ - route = webmethod.route + route = f"/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}" if webmethod.method == "GET": method = "get" From 1619d37cc653cb1d9cbddcbc5627cd818b11b3e6 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 09:54:30 -0800 Subject: [PATCH 02/22] codegen per-distro dependencies; not hooked into setup.py yet --- MANIFEST.in | 1 + distributions/dependencies.json | 177 ++++++++++++++++++++++++++ llama_stack/scripts/distro_codegen.py | 38 ++++++ 3 files changed, 216 insertions(+) create mode 100644 distributions/dependencies.json diff --git a/MANIFEST.in b/MANIFEST.in index 27cb775f7..4d1843051 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include requirements.txt +include distributions/dependencies.json include llama_stack/distribution/*.sh include llama_stack/cli/scripts/*.sh include llama_stack/templates/*/*.yaml diff --git 
a/distributions/dependencies.json b/distributions/dependencies.json new file mode 100644 index 000000000..6827af1f1 --- /dev/null +++ b/distributions/dependencies.json @@ -0,0 +1,177 @@ +{ + "together": [ + "scipy", + "scikit-learn", + "nltk", + "chardet", + "chromadb-client", + "psycopg2-binary", + "sentencepiece", + "faiss-cpu", + "blobfile", + "pandas", + "pillow", + "together", + "pypdf", + "matplotlib", + "aiosqlite", + "redis", + "transformers", + "numpy", + "tqdm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu", + "aiosqlite", + "fastapi", + "fire", + "httpx", + "uvicorn" + ], + "remote-vllm": [ + "scipy", + "scikit-learn", + "nltk", + "chardet", + "chromadb-client", + "psycopg2-binary", + "sentencepiece", + "faiss-cpu", + "blobfile", + "pandas", + "pillow", + "pypdf", + "matplotlib", + "openai", + "aiosqlite", + "redis", + "transformers", + "numpy", + "tqdm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu", + "aiosqlite", + "fastapi", + "fire", + "httpx", + "uvicorn" + ], + "fireworks": [ + "scipy", + "scikit-learn", + "nltk", + "chardet", + "chromadb-client", + "psycopg2-binary", + "sentencepiece", + "faiss-cpu", + "blobfile", + "pandas", + "pillow", + "pypdf", + "matplotlib", + "aiosqlite", + "redis", + "transformers", + "fireworks-ai", + "numpy", + "tqdm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu", + "aiosqlite", + "fastapi", + "fire", + "httpx", + "uvicorn" + ], + "tgi": [ + "scipy", + "scikit-learn", + "nltk", + "aiohttp", + "chardet", + "chromadb-client", + "psycopg2-binary", + "huggingface_hub", + "sentencepiece", + "faiss-cpu", + "blobfile", + "pandas", + "pillow", + "pypdf", + "matplotlib", + "aiosqlite", + "transformers", + "redis", + "numpy", + "tqdm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu", + "aiosqlite", + "fastapi", + "fire", + "httpx", + "uvicorn" + ], + "meta-reference-gpu": [ + "lm-format-enforcer", + "scipy", + "scikit-learn", + "nltk", + "accelerate", + "chardet", + "chromadb-client", + "psycopg2-binary", + "sentencepiece", + "zmq", + "faiss-cpu", + "torchvision", + "blobfile", + "fairscale", + "pandas", + "pillow", + "pypdf", + "matplotlib", + "transformers", + "torch", + "aiosqlite", + "redis", + "numpy", + "tqdm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu", + "aiosqlite", + "fastapi", + "fire", + "httpx", + "uvicorn" + ], + "ollama": [ + "scipy", + "scikit-learn", + "nltk", + "aiohttp", + "ollama", + "chardet", + "chromadb-client", + "psycopg2-binary", + "sentencepiece", + "faiss-cpu", + "blobfile", + "pandas", + "pillow", + "pypdf", + "matplotlib", + "aiosqlite", + "transformers", + "redis", + "numpy", + "tqdm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu", + "aiosqlite", + "fastapi", + "fire", + "httpx", + "uvicorn" + ] +} diff --git a/llama_stack/scripts/distro_codegen.py b/llama_stack/scripts/distro_codegen.py index f0d3bb4b9..8bcf97374 100644 --- a/llama_stack/scripts/distro_codegen.py +++ b/llama_stack/scripts/distro_codegen.py @@ -6,6 +6,7 @@ import concurrent.futures import importlib +import json import subprocess import sys from functools import partial @@ -14,6 +15,11 @@ from typing import Iterator from rich.progress import Progress, SpinnerColumn, TextColumn +from llama_stack.distribution.build import ( + get_provider_dependencies, + 
SERVER_DEPENDENCIES, +) + REPO_ROOT = Path(__file__).parent.parent.parent @@ -67,6 +73,36 @@ def check_for_changes() -> bool: return result.returncode != 0 +def collect_template_dependencies(template_dir: Path) -> tuple[str, list[str]]: + try: + module_name = f"llama_stack.templates.{template_dir.name}" + module = importlib.import_module(module_name) + + if template_func := getattr(module, "get_distribution_template", None): + template = template_func() + normal_deps, special_deps = get_provider_dependencies(template.providers) + # Combine all dependencies in order: normal deps, special deps, server deps + all_deps = normal_deps + special_deps + SERVER_DEPENDENCIES + return template.name, all_deps + except Exception: + return None, [] + return None, [] + + +def generate_dependencies_file(): + templates_dir = REPO_ROOT / "llama_stack" / "templates" + distribution_deps = {} + + for template_dir in find_template_dirs(templates_dir): + name, deps = collect_template_dependencies(template_dir) + if name: + distribution_deps[name] = deps + + deps_file = REPO_ROOT / "distributions" / "dependencies.json" + with open(deps_file, "w") as f: + json.dump(distribution_deps, f, indent=2) + + def main(): templates_dir = REPO_ROOT / "llama_stack" / "templates" @@ -88,6 +124,8 @@ def main(): list(executor.map(process_func, template_dirs)) progress.update(task, advance=len(template_dirs)) + generate_dependencies_file() + if check_for_changes(): print( "Distribution template changes detected. Please commit the changes.", From 1b0f5fff5ae36f765d24bfaab24bc305ede5ebe3 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 19 Nov 2024 10:26:05 -0800 Subject: [PATCH 03/22] fix curl endpoint --- docs/source/getting_started/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index eb95db7cc..189bd6cb5 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -535,10 +535,10 @@ $ llama-stack-client models list Once the server is set up, we can test it with a client to verify it's working correctly. The following command will send a chat completion request to the server's `/inference/chat_completion` API: ```bash -$ curl http://localhost:5000/inference/chat_completion \ +$ curl http://localhost:5000/alpha/inference/chat-completion \ -H "Content-Type: application/json" \ -d '{ - "model_id": "Llama3.1-8B-Instruct", + "model_id": "meta-llama/Llama-3.1-8B-Instruct", "messages": [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Write me a 2 sentence poem about the moon"} From 39e99b39fe60b0064f91cacd52911b9863da54c9 Mon Sep 17 00:00:00 2001 From: Henry Tai Date: Wed, 20 Nov 2024 02:32:19 +0800 Subject: [PATCH 04/22] update quick start to have the working instruction (#467) # What does this PR do? Fix the instruction in quickstart readme so the new developers/users can run it without issues. ## Test Plan None ## Sources Please link relevant resources if necessary. ## Before submitting - [X] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [X] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [X] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. 
Co-authored-by: Henry Tai --- docs/zero_to_hero_guide/quickstart.md | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/zero_to_hero_guide/quickstart.md b/docs/zero_to_hero_guide/quickstart.md index 54a01e219..df8e9abc4 100644 --- a/docs/zero_to_hero_guide/quickstart.md +++ b/docs/zero_to_hero_guide/quickstart.md @@ -22,14 +22,22 @@ If you're looking for more specific topics like tool calling or agent setup, we - Download and unzip `Ollama-darwin.zip`. - Run the `Ollama` application. -2. **Download the Ollama CLI**: +1. **Download the Ollama CLI**: - Ensure you have the `ollama` command line tool by downloading and installing it from the same website. -3. **Verify Installation**: +1. **Start ollama server**: + - Open the terminal and run: + ``` + ollama serve + ``` + +1. **Run the model**: - Open the terminal and run: ```bash - ollama run llama3.2:1b + ollama run llama3.2:3b-instruct-fp16 ``` + **Note**: The supported models for llama stack for now is listed in [here](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L43) + --- @@ -84,6 +92,8 @@ If you're looking for more specific topics like tool calling or agent setup, we ```bash llama stack run /path/to/your/distro/llamastack-ollama/ollama-run.yaml --port 5050 ``` + Note: + 1. Everytime you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model The server will start and listen on `http://localhost:5050`. @@ -97,7 +107,7 @@ After setting up the server, open a new terminal window and verify it's working curl http://localhost:5050/inference/chat_completion \ -H "Content-Type: application/json" \ -d '{ - "model": "llama3.2:1b", + "model": "Llama3.2-3B-Instruct", "messages": [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Write me a 2-sentence poem about the moon"} @@ -106,6 +116,8 @@ curl http://localhost:5050/inference/chat_completion \ }' ``` +You can check the available models with the command `llama-stack-client models list`. 
+ **Expected Output:** ```json { From c46b462c229c933ed4d5006fcb5951573abd17c6 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 11:36:53 -0800 Subject: [PATCH 05/22] Updates to docker build script --- llama_stack/distribution/build_container.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index 139883618..b56c76ebd 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -9,6 +9,7 @@ LLAMA_MODELS_DIR=${LLAMA_MODELS_DIR:-} LLAMA_STACK_DIR=${LLAMA_STACK_DIR:-} TEST_PYPI_VERSION=${TEST_PYPI_VERSION:-} +BUILD_PLATFORM=${BUILD_PLATFORM:-} if [ "$#" -lt 4 ]; then echo "Usage: $0 []" >&2 @@ -77,6 +78,10 @@ if [ -n "$special_pip_deps" ]; then done fi +# This has been added to simplify UI development, but we likely need +# to add this as a dependency to `llama-stack` itself +add_to_docker "RUN pip install llama-stack-client" + stack_mount="/app/llama-stack-source" models_mount="/app/llama-models-source" @@ -116,7 +121,6 @@ RUN pip install --no-cache $models_mount EOF fi - add_to_docker < Date: Tue, 19 Nov 2024 11:44:35 -0800 Subject: [PATCH 06/22] Add llama-stack-client as a legitimate dependency for llama-stack --- llama_stack/distribution/build_container.sh | 4 ---- requirements.txt | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index b56c76ebd..230ca34ac 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -78,10 +78,6 @@ if [ -n "$special_pip_deps" ]; then done fi -# This has been added to simplify UI development, but we likely need -# to add this as a dependency to `llama-stack` itself -add_to_docker "RUN pip install llama-stack-client" - stack_mount="/app/llama-stack-source" models_mount="/app/llama-models-source" diff --git a/requirements.txt b/requirements.txt index da8b8e638..dcb30d605 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ fire httpx huggingface-hub llama-models>=0.0.50 +llama-stack-client>=0.0.50 prompt-toolkit python-dotenv pydantic>=2 From 05d1ead02f8ee2c3ff34be9fb89d9a5e6bf91e7a Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 13:25:36 -0800 Subject: [PATCH 07/22] Update condition in tests to handle llama-3.1 vs llama3.1 (HF names) --- .../providers/tests/inference/test_text_inference.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llama_stack/providers/tests/inference/test_text_inference.py b/llama_stack/providers/tests/inference/test_text_inference.py index 7b7aca5bd..6e263432a 100644 --- a/llama_stack/providers/tests/inference/test_text_inference.py +++ b/llama_stack/providers/tests/inference/test_text_inference.py @@ -25,7 +25,11 @@ from .utils import group_chunks def get_expected_stop_reason(model: str): - return StopReason.end_of_message if "Llama3.1" in model else StopReason.end_of_turn + return ( + StopReason.end_of_message + if ("Llama3.1" in model or "Llama-3.1" in model) + else StopReason.end_of_turn + ) @pytest.fixture @@ -34,7 +38,7 @@ def common_params(inference_model): "tool_choice": ToolChoice.auto, "tool_prompt_format": ( ToolPromptFormat.json - if "Llama3.1" in inference_model + if ("Llama3.1" in inference_model or "Llama-3.1" in inference_model) else ToolPromptFormat.python_list ), } From 38ba3b9f0ce33fe546ac82b94834590064175e4d 
Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 13:36:14 -0800 Subject: [PATCH 08/22] Fix fireworks stream completion --- .../providers/remote/inference/fireworks/fireworks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index 3ff50d378..02d4b82ef 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -214,10 +214,10 @@ class FireworksInferenceAdapter( async def _to_async_generator(): if "messages" in params: - stream = await self._get_client().chat.completions.acreate(**params) + stream = self._get_client().chat.completions.acreate(**params) else: - stream = self._get_client().completion.create(**params) - for chunk in stream: + stream = self._get_client().completion.acreate(**params) + async for chunk in stream: yield chunk stream = _to_async_generator() From 185df4b568bf2faac2671bf0c046cf584670c812 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 19 Nov 2024 14:09:00 -0800 Subject: [PATCH 09/22] fix fireworks registration --- llama_stack/providers/remote/inference/fireworks/fireworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index 02d4b82ef..d8cbca5f9 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -54,7 +54,7 @@ MODEL_ALIASES = [ ), build_model_alias( "fireworks/llama-v3p2-3b-instruct", - CoreModelId.llama3_2_11b_vision_instruct.value, + CoreModelId.llama3_2_3b_instruct.value, ), build_model_alias( "fireworks/llama-v3p2-11b-vision-instruct", From 189df6358af28dc7588b2035207180027818ddab Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 19 Nov 2024 14:16:00 -0800 Subject: [PATCH 10/22] codegen docs --- distributions/dependencies.json | 164 +++++++++--------- .../self_hosted_distro/fireworks.md | 2 +- llama_stack/templates/fireworks/run.yaml | 2 +- 3 files changed, 84 insertions(+), 84 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 6827af1f1..469b6f14e 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,24 +1,24 @@ { "together": [ "scipy", + "blobfile", + "together", + "tqdm", + "sentencepiece", + "matplotlib", + "pandas", + "pypdf", "scikit-learn", "nltk", - "chardet", - "chromadb-client", - "psycopg2-binary", - "sentencepiece", "faiss-cpu", - "blobfile", - "pandas", - "pillow", - "together", - "pypdf", - "matplotlib", + "chardet", + "numpy", + "psycopg2-binary", "aiosqlite", + "pillow", "redis", "transformers", - "numpy", - "tqdm", + "chromadb-client", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu", "aiosqlite", @@ -29,24 +29,24 @@ ], "remote-vllm": [ "scipy", + "blobfile", + "tqdm", + "sentencepiece", + "matplotlib", + "pandas", + "pypdf", "scikit-learn", "nltk", - "chardet", - "chromadb-client", - "psycopg2-binary", - "sentencepiece", "faiss-cpu", - "blobfile", - "pandas", - "pillow", - "pypdf", - "matplotlib", + "chardet", "openai", + "numpy", + "psycopg2-binary", "aiosqlite", + "pillow", "redis", "transformers", - "numpy", - "tqdm", + "chromadb-client", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu", 
"aiosqlite", @@ -57,24 +57,24 @@ ], "fireworks": [ "scipy", + "blobfile", + "tqdm", + "sentencepiece", + "fireworks-ai", + "matplotlib", + "pandas", + "pypdf", "scikit-learn", "nltk", - "chardet", - "chromadb-client", - "psycopg2-binary", - "sentencepiece", "faiss-cpu", - "blobfile", - "pandas", - "pillow", - "pypdf", - "matplotlib", + "chardet", + "numpy", + "psycopg2-binary", "aiosqlite", + "pillow", "redis", "transformers", - "fireworks-ai", - "numpy", - "tqdm", + "chromadb-client", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu", "aiosqlite", @@ -85,25 +85,25 @@ ], "tgi": [ "scipy", - "scikit-learn", - "nltk", - "aiohttp", - "chardet", - "chromadb-client", - "psycopg2-binary", + "blobfile", + "tqdm", "huggingface_hub", "sentencepiece", - "faiss-cpu", - "blobfile", - "pandas", - "pillow", - "pypdf", "matplotlib", - "aiosqlite", - "transformers", - "redis", + "pandas", + "pypdf", + "scikit-learn", + "nltk", + "faiss-cpu", + "chardet", "numpy", - "tqdm", + "psycopg2-binary", + "aiosqlite", + "pillow", + "redis", + "transformers", + "chromadb-client", + "aiohttp", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu", "aiosqlite", @@ -113,30 +113,30 @@ "uvicorn" ], "meta-reference-gpu": [ - "lm-format-enforcer", "scipy", - "scikit-learn", - "nltk", - "accelerate", - "chardet", - "chromadb-client", - "psycopg2-binary", + "blobfile", + "tqdm", + "torchvision", "sentencepiece", "zmq", - "faiss-cpu", - "torchvision", - "blobfile", - "fairscale", - "pandas", - "pillow", - "pypdf", "matplotlib", - "transformers", + "pandas", + "pypdf", + "scikit-learn", + "accelerate", + "nltk", + "faiss-cpu", "torch", - "aiosqlite", - "redis", + "chardet", "numpy", - "tqdm", + "psycopg2-binary", + "aiosqlite", + "pillow", + "redis", + "fairscale", + "lm-format-enforcer", + "transformers", + "chromadb-client", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu", "aiosqlite", @@ -147,25 +147,25 @@ ], "ollama": [ "scipy", + "blobfile", + "tqdm", + "sentencepiece", + "matplotlib", + "pandas", + "pypdf", "scikit-learn", "nltk", - "aiohttp", "ollama", - "chardet", - "chromadb-client", - "psycopg2-binary", - "sentencepiece", "faiss-cpu", - "blobfile", - "pandas", - "pillow", - "pypdf", - "matplotlib", - "aiosqlite", - "transformers", - "redis", + "chardet", "numpy", - "tqdm", + "psycopg2-binary", + "aiosqlite", + "pillow", + "redis", + "transformers", + "chromadb-client", + "aiohttp", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu", "aiosqlite", diff --git a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md b/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md index f940e6de2..66a150f50 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md @@ -26,7 +26,7 @@ The following models are available by default: - `meta-llama/Llama-3.1-70B-Instruct (fireworks/llama-v3p1-70b-instruct)` - `meta-llama/Llama-3.1-405B-Instruct-FP8 (fireworks/llama-v3p1-405b-instruct)` - `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-1b-instruct)` -- `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-3b-instruct)` +- `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-3b-instruct)` - `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-11b-vision-instruct)` - 
`meta-llama/Llama-3.2-90B-Vision-Instruct (fireworks/llama-v3p2-90b-vision-instruct)` - `meta-llama/Llama-Guard-3-8B (fireworks/llama-guard-3-8b)` diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index c9c05a8e0..aa44f0f84 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -61,7 +61,7 @@ models: provider_id: null provider_model_id: fireworks/llama-v3p2-1b-instruct - metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct + model_id: meta-llama/Llama-3.2-3B-Instruct provider_id: null provider_model_id: fireworks/llama-v3p2-3b-instruct - metadata: {} From 2da93c883533d49dd070f58b8f3ab5bc019c136c Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 19 Nov 2024 14:20:07 -0800 Subject: [PATCH 11/22] fix 3.2-1b fireworks --- distributions/dependencies.json | 204 +++++++++--------- .../self_hosted_distro/fireworks.md | 2 +- .../remote/inference/fireworks/fireworks.py | 2 +- llama_stack/templates/fireworks/run.yaml | 2 +- 4 files changed, 105 insertions(+), 105 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 469b6f14e..0f85b70c6 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,26 +1,26 @@ { "together": [ - "scipy", - "blobfile", - "together", - "tqdm", - "sentencepiece", - "matplotlib", - "pandas", "pypdf", - "scikit-learn", - "nltk", - "faiss-cpu", - "chardet", - "numpy", - "psycopg2-binary", - "aiosqlite", - "pillow", + "sentencepiece", + "pandas", "redis", - "transformers", + "nltk", + "psycopg2-binary", + "scikit-learn", + "chardet", + "matplotlib", + "pillow", + "tqdm", "chromadb-client", - "sentence-transformers --no-deps", + "transformers", + "blobfile", + "aiosqlite", + "together", + "faiss-cpu", + "scipy", + "numpy", "torch --index-url https://download.pytorch.org/whl/cpu", + "sentence-transformers --no-deps", "aiosqlite", "fastapi", "fire", @@ -28,27 +28,27 @@ "uvicorn" ], "remote-vllm": [ - "scipy", - "blobfile", - "tqdm", - "sentencepiece", - "matplotlib", - "pandas", "pypdf", - "scikit-learn", - "nltk", - "faiss-cpu", - "chardet", - "openai", - "numpy", - "psycopg2-binary", - "aiosqlite", - "pillow", + "sentencepiece", + "pandas", "redis", - "transformers", + "nltk", + "psycopg2-binary", + "scikit-learn", + "chardet", + "matplotlib", + "pillow", + "tqdm", "chromadb-client", - "sentence-transformers --no-deps", + "transformers", + "openai", + "blobfile", + "aiosqlite", + "faiss-cpu", + "scipy", + "numpy", "torch --index-url https://download.pytorch.org/whl/cpu", + "sentence-transformers --no-deps", "aiosqlite", "fastapi", "fire", @@ -56,27 +56,27 @@ "uvicorn" ], "fireworks": [ - "scipy", - "blobfile", - "tqdm", + "pypdf", "sentencepiece", + "pandas", + "redis", + "nltk", + "psycopg2-binary", + "scikit-learn", + "chardet", "fireworks-ai", "matplotlib", - "pandas", - "pypdf", - "scikit-learn", - "nltk", - "faiss-cpu", - "chardet", - "numpy", - "psycopg2-binary", - "aiosqlite", "pillow", - "redis", - "transformers", + "tqdm", "chromadb-client", - "sentence-transformers --no-deps", + "transformers", + "blobfile", + "aiosqlite", + "faiss-cpu", + "scipy", + "numpy", "torch --index-url https://download.pytorch.org/whl/cpu", + "sentence-transformers --no-deps", "aiosqlite", "fastapi", "fire", @@ -84,28 +84,28 @@ "uvicorn" ], "tgi": [ - "scipy", - "blobfile", + "pypdf", + "sentencepiece", + "pandas", + "redis", + "nltk", + "psycopg2-binary", + "scikit-learn", + "chardet", + "matplotlib", + "pillow", 
"tqdm", "huggingface_hub", - "sentencepiece", - "matplotlib", - "pandas", - "pypdf", - "scikit-learn", - "nltk", - "faiss-cpu", - "chardet", - "numpy", - "psycopg2-binary", - "aiosqlite", - "pillow", - "redis", - "transformers", "chromadb-client", "aiohttp", - "sentence-transformers --no-deps", + "transformers", + "blobfile", + "aiosqlite", + "faiss-cpu", + "scipy", + "numpy", "torch --index-url https://download.pytorch.org/whl/cpu", + "sentence-transformers --no-deps", "aiosqlite", "fastapi", "fire", @@ -113,32 +113,32 @@ "uvicorn" ], "meta-reference-gpu": [ - "scipy", - "blobfile", - "tqdm", - "torchvision", - "sentencepiece", - "zmq", - "matplotlib", - "pandas", "pypdf", - "scikit-learn", - "accelerate", - "nltk", - "faiss-cpu", + "sentencepiece", "torch", - "chardet", - "numpy", - "psycopg2-binary", - "aiosqlite", - "pillow", + "pandas", "redis", + "nltk", + "psycopg2-binary", + "scikit-learn", + "chardet", + "accelerate", + "matplotlib", + "pillow", "fairscale", + "tqdm", "lm-format-enforcer", - "transformers", "chromadb-client", - "sentence-transformers --no-deps", + "transformers", + "blobfile", + "aiosqlite", + "torchvision", + "faiss-cpu", + "zmq", + "scipy", + "numpy", "torch --index-url https://download.pytorch.org/whl/cpu", + "sentence-transformers --no-deps", "aiosqlite", "fastapi", "fire", @@ -146,28 +146,28 @@ "uvicorn" ], "ollama": [ - "scipy", - "blobfile", - "tqdm", - "sentencepiece", - "matplotlib", - "pandas", - "pypdf", - "scikit-learn", - "nltk", "ollama", - "faiss-cpu", - "chardet", - "numpy", - "psycopg2-binary", - "aiosqlite", - "pillow", + "pypdf", + "sentencepiece", + "pandas", "redis", - "transformers", + "nltk", + "psycopg2-binary", + "scikit-learn", + "chardet", + "matplotlib", + "pillow", + "tqdm", "chromadb-client", "aiohttp", - "sentence-transformers --no-deps", + "transformers", + "blobfile", + "aiosqlite", + "faiss-cpu", + "scipy", + "numpy", "torch --index-url https://download.pytorch.org/whl/cpu", + "sentence-transformers --no-deps", "aiosqlite", "fastapi", "fire", diff --git a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md b/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md index 66a150f50..cca1155e1 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md @@ -25,7 +25,7 @@ The following models are available by default: - `meta-llama/Llama-3.1-8B-Instruct (fireworks/llama-v3p1-8b-instruct)` - `meta-llama/Llama-3.1-70B-Instruct (fireworks/llama-v3p1-70b-instruct)` - `meta-llama/Llama-3.1-405B-Instruct-FP8 (fireworks/llama-v3p1-405b-instruct)` -- `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-1b-instruct)` +- `meta-llama/Llama-3.2-1B-Instruct (fireworks/llama-v3p2-1b-instruct)` - `meta-llama/Llama-3.2-3B-Instruct (fireworks/llama-v3p2-3b-instruct)` - `meta-llama/Llama-3.2-11B-Vision-Instruct (fireworks/llama-v3p2-11b-vision-instruct)` - `meta-llama/Llama-3.2-90B-Vision-Instruct (fireworks/llama-v3p2-90b-vision-instruct)` diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index d8cbca5f9..c3e634155 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -50,7 +50,7 @@ MODEL_ALIASES = [ ), build_model_alias( "fireworks/llama-v3p2-1b-instruct", - CoreModelId.llama3_2_3b_instruct.value, + 
CoreModelId.llama3_2_1b_instruct.value, ), build_model_alias( "fireworks/llama-v3p2-3b-instruct", diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index aa44f0f84..6add39c3a 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -57,7 +57,7 @@ models: provider_id: null provider_model_id: fireworks/llama-v3p1-405b-instruct - metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct + model_id: meta-llama/Llama-3.2-1B-Instruct provider_id: null provider_model_id: fireworks/llama-v3p2-1b-instruct - metadata: {} From 887ccc2143ed922f529eab87cd7bf1e4718e4915 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 15:20:51 -0800 Subject: [PATCH 12/22] Ensure llama-stack-client is installed in the container with TEST_PYPI --- llama_stack/distribution/build_container.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index 230ca34ac..2730ae174 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -97,7 +97,7 @@ else add_to_docker "RUN pip install fastapi libcst" add_to_docker < Date: Tue, 19 Nov 2024 15:50:26 -0800 Subject: [PATCH 13/22] Add logs (prints :/) to dump out what URL vllm / tgi is connecting to --- llama_stack/providers/remote/inference/tgi/tgi.py | 1 + llama_stack/providers/remote/inference/vllm/vllm.py | 1 + 2 files changed, 2 insertions(+) diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py index 30745cb10..92492e3da 100644 --- a/llama_stack/providers/remote/inference/tgi/tgi.py +++ b/llama_stack/providers/remote/inference/tgi/tgi.py @@ -264,6 +264,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate): class TGIAdapter(_HfAdapter): async def initialize(self, config: TGIImplConfig) -> None: + print(f"Initializing TGI client with url={config.url}") self.client = AsyncInferenceClient(model=config.url, token=config.api_token) endpoint_info = await self.client.get_endpoint_info() self.max_tokens = endpoint_info["max_total_tokens"] diff --git a/llama_stack/providers/remote/inference/vllm/vllm.py b/llama_stack/providers/remote/inference/vllm/vllm.py index 788f6cac4..3c877639c 100644 --- a/llama_stack/providers/remote/inference/vllm/vllm.py +++ b/llama_stack/providers/remote/inference/vllm/vllm.py @@ -53,6 +53,7 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate): self.client = None async def initialize(self) -> None: + print(f"Initializing VLLM client with base_url={self.config.url}") self.client = OpenAI(base_url=self.config.url, api_key=self.config.api_token) async def shutdown(self) -> None: From e605d57fb78285828530b2603d21aaa8593df75d Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 15:59:47 -0800 Subject: [PATCH 14/22] use API version in "remote" stack client --- llama_stack/distribution/client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_stack/distribution/client.py b/llama_stack/distribution/client.py index b36ef94e4..e1243cb7a 100644 --- a/llama_stack/distribution/client.py +++ b/llama_stack/distribution/client.py @@ -15,6 +15,8 @@ import httpx from pydantic import BaseModel, parse_obj_as from termcolor import cprint +from llama_stack.apis.version import LLAMA_STACK_API_VERSION + from llama_stack.providers.datatypes import RemoteProviderConfig _CLIENT_CLASSES = {} @@ -117,7 +119,7 
@@ def create_api_client_class(protocol) -> Type: break kwargs[param.name] = args[i] - url = f"{self.base_url}{webmethod.route}" + url = f"{self.base_url}/{LLAMA_STACK_API_VERSION}/{webmethod.route.lstrip('/')}" def convert(value): if isinstance(value, list): From f78200b1898e1de19e6ee270bdf7e873ef52fa76 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 19 Nov 2024 16:37:30 -0800 Subject: [PATCH 15/22] docs --- .../distributions/self_hosted_distro/index.md | 1 + docs/source/getting_started/index.md | 405 +----------------- 2 files changed, 9 insertions(+), 397 deletions(-) diff --git a/docs/source/getting_started/distributions/self_hosted_distro/index.md b/docs/source/getting_started/distributions/self_hosted_distro/index.md index ed6ab5d7f..502b95cb4 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/index.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/index.md @@ -23,5 +23,6 @@ tgi dell-tgi together fireworks +remote-vllm bedrock ``` diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index 189bd6cb5..6400fb285 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -53,9 +53,9 @@ Please see our pages in detail for the types of distributions we offer: 3. [On-device Distribution](./distributions/ondevice_distro/index.md): If you want to run Llama Stack inference on your iOS / Android device. -### Quick Start Commands +### Table of Contents -Once you have decided on the inference provider and distribution to use, use the following quick start commands to get started. +Once you have decided on the inference provider and distribution to use, use the following guides to get started. ##### 1.0 Prerequisite @@ -109,421 +109,32 @@ Access to Single-Node CPU with Fireworks hosted endpoint via API_KEY from [firew ##### 1.1. Start the distribution -**(Option 1) Via Docker** -::::{tab-set} - :::{tab-item} meta-reference-gpu -``` -$ cd llama-stack/distributions/meta-reference-gpu && docker compose up -``` - -This will download and start running a pre-built Docker container. Alternatively, you may use the following commands: - -``` -docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-gpu --yaml_config /root/my-run.yaml -``` +[Start Meta Reference GPU Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) ::: :::{tab-item} vLLM -``` -$ cd llama-stack/distributions/remote-vllm && docker compose up -``` - -The script will first start up vLLM server on port 8000, then start up Llama Stack distribution server hooking up to it for inference. You should see the following outputs -- -``` - -``` - -To kill the server -``` -docker compose down -``` +[Start vLLM Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/remote-vllm.html) ::: :::{tab-item} tgi -``` -$ cd llama-stack/distributions/tgi && docker compose up -``` - -The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference. 
You should see the following outputs -- -``` -[text-generation-inference] | 2024-10-15T18:56:33.810397Z INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama) -[text-generation-inference] | 2024-10-15T18:56:33.810448Z WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0 -[text-generation-inference] | 2024-10-15T18:56:33.864143Z INFO text_generation_router::server: router/src/server.rs:2353: Connected -INFO: Started server process [1] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) -``` - -To kill the server -``` -docker compose down -``` -::: - - -:::{tab-item} ollama -``` -$ cd llama-stack/distributions/ollama && docker compose up - -# OR - -$ cd llama-stack/distributions/ollama-gpu && docker compose up -``` - -You will see outputs similar to following --- -``` -[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps" -[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps" -INFO: Started server process [1] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) -[llamastack] | Resolved 12 providers -[llamastack] | inner-inference => ollama0 -[llamastack] | models => __routing_table__ -[llamastack] | inference => __autorouted__ -``` - -To kill the server -``` -docker compose down -``` -::: - -:::{tab-item} fireworks -``` -$ cd llama-stack/distributions/fireworks && docker compose up -``` - -Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks URL server endpoint. E.g. -``` -inference: - - provider_id: fireworks - provider_type: remote::fireworks - config: - url: https://api.fireworks.ai/inference - api_key: -``` -::: - -:::{tab-item} together -``` -$ cd distributions/together && docker compose up -``` - -Make sure your `run.yaml` file has the inference provider pointing to the correct Together URL server endpoint. E.g. -``` -inference: - - provider_id: together - provider_type: remote::together - config: - url: https://api.together.xyz/v1 - api_key: -``` -::: - - -:::: - -**(Option 2) Via Conda** - -::::{tab-set} - -:::{tab-item} meta-reference-gpu -1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html) - -2. Build the `meta-reference-gpu` distribution - -``` -$ llama stack build --template meta-reference-gpu --image-type conda -``` - -3. Start running distribution -``` -$ llama stack run ~/.llama/distributions/llamastack-meta-reference-gpu/meta-reference-gpu-run.yaml -``` - -Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section: -``` -memory: - - provider_id: faiss-0 - provider_type: faiss - config: - kvstore: - namespace: null - type: sqlite - db_path: ~/.llama/runtime/faiss_store.db -``` - -::: - -:::{tab-item} tgi -1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html) - -2. 
Build the `tgi` distribution - -```bash -llama stack build --template tgi --image-type conda -``` - -3. Start a TGI server endpoint - -4. Make sure in your `run.yaml` file, your `conda_env` is pointing to the conda environment and inference provider is pointing to the correct TGI server endpoint. E.g. -``` -conda_env: llamastack-tgi -... -inference: - - provider_id: tgi0 - provider_type: remote::tgi - config: - url: http://127.0.0.1:5009 -``` - -5. Start Llama Stack server -```bash -$ llama stack run ~/.llama/distributions/llamastack-tgi/tgi-run.yaml -``` - -Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section: -``` -memory: - - provider_id: faiss-0 - provider_type: faiss - config: - kvstore: - namespace: null - type: sqlite - db_path: ~/.llama/runtime/faiss_store.db -``` +[Start TGI Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) ::: :::{tab-item} ollama - -If you wish to separately spin up a Ollama server, and connect with Llama Stack, you may use the following commands. - -#### Start Ollama server. -- Please check the [Ollama Documentations](https://github.com/ollama/ollama) for more details. - -**Via Docker** -``` -docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama -``` - -**Via CLI** -``` -ollama run -``` - -#### Start Llama Stack server pointing to Ollama server - -Make sure your `run.yaml` file has the inference provider pointing to the correct Ollama endpoint. E.g. -``` -conda_env: llamastack-ollama -... -inference: - - provider_id: ollama0 - provider_type: remote::ollama - config: - url: http://127.0.0.1:11434 -``` - -``` -llama stack build --template ollama --image-type conda -llama stack run ~/.llama/distributions/llamastack-ollama/ollama-run.yaml -``` - -Note: If you wish to use pgvector or chromadb as memory provider. You may need to update generated `run.yaml` file to point to the desired memory provider. See [Memory Providers](https://llama-stack.readthedocs.io/en/latest/api_providers/memory_api.html) for more details. Or comment out the pgvector or chromadb memory provider in `run.yaml` file to use the default inline memory provider, keeping only the following section: -``` -memory: - - provider_id: faiss-0 - provider_type: faiss - config: - kvstore: - namespace: null - type: sqlite - db_path: ~/.llama/runtime/faiss_store.db -``` - -::: - -:::{tab-item} fireworks - -```bash -llama stack build --template fireworks --image-type conda -# -- modify run.yaml to a valid Fireworks server endpoint -llama stack run ./run.yaml -``` - -Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks URL server endpoint. E.g. -``` -conda_env: llamastack-fireworks -... 
-inference: - - provider_id: fireworks - provider_type: remote::fireworks - config: - url: https://api.fireworks.ai/inference - api_key: -``` +[Start Ollama Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) ::: :::{tab-item} together - -```bash -llama stack build --template together --image-type conda -# -- modify run.yaml to a valid Together server endpoint -llama stack run ~/.llama/distributions/llamastack-together/together-run.yaml -``` - -Make sure your `run.yaml` file has the inference provider pointing to the correct Together URL server endpoint. E.g. -``` -conda_env: llamastack-together -... -inference: - - provider_id: together - provider_type: remote::together - config: - url: https://api.together.xyz/v1 - api_key: -``` -::: - -:::: - -##### 1.2 (Optional) Update Model Serving Configuration -::::{tab-set} - -:::{tab-item} meta-reference-gpu -You may change the `config.model` in `run.yaml` to update the model currently being served by the distribution. Make sure you have the model checkpoint downloaded in your `~/.llama`. -``` -inference: - - provider_id: meta0 - provider_type: inline::meta-reference - config: - model: Llama3.2-11B-Vision-Instruct - quantization: null - torch_seed: null - max_seq_len: 4096 - max_batch_size: 1 -``` - -Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. -::: - -:::{tab-item} tgi -To serve a new model with `tgi`, change the docker command flag `--model-id `. - -This can be done by edit the `command` args in `compose.yaml`. E.g. Replace "Llama-3.2-1B-Instruct" with the model you want to serve. - -``` -command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.2-1B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"] -``` - -or by changing the docker run command's `--model-id` flag -``` -docker run --rm -it -v $HOME/.cache/huggingface:/data -p 5009:5009 --gpus all ghcr.io/huggingface/text-generation-inference:latest --dtype bfloat16 --usage-stats on --sharded false --model-id meta-llama/Llama-3.2-1B-Instruct --port 5009 -``` - -Make sure your `run.yaml` file has the inference provider pointing to the TGI server endpoint serving your model. -``` -inference: - - provider_id: tgi0 - provider_type: remote::tgi - config: - url: http://127.0.0.1:5009 -``` -``` - -Run `llama model list` to see the available models to download, and `llama model download` to download the checkpoints. -::: - -:::{tab-item} ollama -You can use ollama for managing model downloads. - -``` -ollama pull llama3.1:8b-instruct-fp16 -ollama pull llama3.1:70b-instruct-fp16 -``` - -> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models. - - -To serve a new model with `ollama` -``` -ollama run -``` - -To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama. 
-``` -$ ollama ps - -NAME ID SIZE PROCESSOR UNTIL -llama3.1:8b-instruct-fp16 4aacac419454 17 GB 100% GPU 4 minutes from now -``` - -To verify that the model served by ollama is correctly connected to Llama Stack server -``` -$ llama-stack-client models list -+----------------------+----------------------+---------------+-----------------------------------------------+ -| identifier | llama_model | provider_id | metadata | -+======================+======================+===============+===============================================+ -| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | ollama0 | {'ollama_model': 'llama3.1:8b-instruct-fp16'} | -+----------------------+----------------------+---------------+-----------------------------------------------+ -``` -::: - -:::{tab-item} together -Use `llama-stack-client models list` to check the available models served by together. - -``` -$ llama-stack-client models list -+------------------------------+------------------------------+---------------+------------+ -| identifier | llama_model | provider_id | metadata | -+==============================+==============================+===============+============+ -| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -``` +[Start Together Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html) ::: :::{tab-item} fireworks -Use `llama-stack-client models list` to check the available models served by Fireworks. 
-``` -$ llama-stack-client models list -+------------------------------+------------------------------+---------------+------------+ -| identifier | llama_model | provider_id | metadata | -+==============================+==============================+===============+============+ -| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-1B-Instruct | Llama3.2-1B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -``` +[Start Fireworks Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html) ::: :::: - ##### Troubleshooting - If you encounter any issues, search through our [GitHub Issues](https://github.com/meta-llama/llama-stack/issues), or file an new issue. - Use `--port ` flag to use a different port number. For docker run, update the `-p :` flag. From c49acc5226b50f51b3756fe66315ab3dd2e847f9 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 19 Nov 2024 16:39:40 -0800 Subject: [PATCH 16/22] docs --- docs/source/getting_started/index.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index 6400fb285..bc0258376 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -109,12 +109,13 @@ Access to Single-Node CPU with Fireworks hosted endpoint via API_KEY from [firew ##### 1.1. 
Start the distribution +::::{tab-set} :::{tab-item} meta-reference-gpu -[Start Meta Reference GPU Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) +- [Start Meta Reference GPU Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/meta-reference-gpu.html) ::: :::{tab-item} vLLM -[Start vLLM Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/remote-vllm.html) +- [Start vLLM Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/remote-vllm.html) ::: :::{tab-item} tgi From b0fdf7552ac5ba5cc3398b4a74b10f53af3677bc Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 19 Nov 2024 16:41:45 -0800 Subject: [PATCH 17/22] docs --- docs/source/getting_started/index.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index bc0258376..5fc2c5ed8 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -119,19 +119,19 @@ Access to Single-Node CPU with Fireworks hosted endpoint via API_KEY from [firew ::: :::{tab-item} tgi -[Start TGI Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) +- [Start TGI Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/tgi.html) ::: :::{tab-item} ollama -[Start Ollama Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) +- [Start Ollama Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/ollama.html) ::: :::{tab-item} together -[Start Together Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html) +- [Start Together Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/together.html) ::: :::{tab-item} fireworks -[Start Fireworks Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html) +- [Start Fireworks Distribution](https://llama-stack.readthedocs.io/en/latest/getting_started/distributions/self_hosted_distro/fireworks.html) ::: :::: From dd5466e17d5b384c42f6ed5a2a570fe24a8da71f Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 16:44:15 -0800 Subject: [PATCH 18/22] Bump version to 0.0.53 --- requirements.txt | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index dcb30d605..fddf51880 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,8 @@ blobfile fire httpx huggingface-hub -llama-models>=0.0.50 -llama-stack-client>=0.0.50 +llama-models>=0.0.53 +llama-stack-client>=0.0.53 prompt-toolkit python-dotenv pydantic>=2 diff --git a/setup.py b/setup.py index 3145506f9..13f389a11 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def read_requirements(): setup( name="llama_stack", - version="0.0.50", + version="0.0.53", author="Meta Llama", author_email="llama-oss@meta.com", description="Llama Stack", From e670f99ef7d3e0b3ff1041e4785ad7c7a5db2a99 Mon Sep 17 00:00:00 2001 From: Dinesh Yeduguru Date: Tue, 19 Nov 2024 17:36:08 -0800 Subject: [PATCH 19/22] add changelog 
(#487) --- CHANGELOG.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..b081678c4 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,35 @@ +# Changelog + +## 0.0.53 + +### Added +- Resource-oriented design for models, shields, memory banks, datasets and eval tasks +- Persistence for registered objects with distribution +- Ability to persist memory banks created for FAISS +- PostgreSQL KVStore implementation +- Environment variable placeholder support in run.yaml files +- Comprehensive Zero-to-Hero notebooks and quickstart guides +- Support for quantized models in Ollama +- Vision models support for Together, Fireworks, Meta-Reference, and Ollama, and vLLM +- Bedrock distribution with safety shields support +- Evals API with task registration and scoring functions +- MMLU and SimpleQA benchmark scoring functions +- Huggingface dataset provider integration for benchmarks +- Support for custom dataset registration from local paths +- Benchmark evaluation CLI tools with visualization tables +- RAG evaluation scoring functions and metrics +- Local persistence for datasets and eval tasks + +### Changed +- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner) +- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`) +- Updated API signatures for dataset and eval task registration +- Restructured folder organization for providers +- Enhanced Docker build configuration +- Added version prefixing for REST API routes +- Enhanced evaluation task registration workflow +- Improved benchmark evaluation output formatting +- Restructured evals folder organization for better modularity + +### Removed +- `llama stack configure` command From 08be0232907d37cf36522df2dd7a0be80ba2d711 Mon Sep 17 00:00:00 2001 From: varunfb Date: Tue, 19 Nov 2024 17:42:43 -0800 Subject: [PATCH 20/22] Added optional md5 validate command once download is completed (#486) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Adds description at the end of successful download the optionally run the verify md5 checksums command. ## Test Plan Screenshot 2024-11-19 at 12 11 37 PM ## Before submitting - [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [x] Updated relevant documentation. - [x] Wrote necessary unit or integration tests. 
--------- Co-authored-by: varunfb --- llama_stack/cli/download.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/llama_stack/cli/download.py b/llama_stack/cli/download.py index bb57186e5..c2f8ac855 100644 --- a/llama_stack/cli/download.py +++ b/llama_stack/cli/download.py @@ -380,6 +380,7 @@ def _hf_download( def _meta_download( model: "Model", + model_id: str, meta_url: str, info: "LlamaDownloadInfo", max_concurrent_downloads: int, @@ -405,8 +406,15 @@ def _meta_download( downloader = ParallelDownloader(max_concurrent_downloads=max_concurrent_downloads) asyncio.run(downloader.download_all(tasks)) - print(f"\nSuccessfully downloaded model to {output_dir}") - cprint(f"\nMD5 Checksums are at: {output_dir / 'checklist.chk'}", "white") + cprint(f"\nSuccessfully downloaded model to {output_dir}", "green") + cprint( + f"\nView MD5 checksum files at: {output_dir / 'checklist.chk'}", + "white", + ) + cprint( + f"\n[Optionally] To run MD5 checksums, use the following command: llama model verify-download --model-id {model_id}", + "yellow", + ) class ModelEntry(BaseModel): @@ -512,7 +520,7 @@ def run_download_cmd(args: argparse.Namespace, parser: argparse.ArgumentParser): ) if "llamameta.net" not in meta_url: parser.error("Invalid Meta URL provided") - _meta_download(model, meta_url, info, args.max_parallel) + _meta_download(model, model_id, meta_url, info, args.max_parallel) except Exception as e: parser.error(f"Download failed: {str(e)}") From 1086b500f94828fbe21772619ed022d586fc62fb Mon Sep 17 00:00:00 2001 From: Mengtao Yuan Date: Tue, 19 Nov 2024 20:59:02 -0800 Subject: [PATCH 21/22] Support Tavily as built-in search tool. (#485) # What does this PR do? Add Tavily as a built-in search tool, in addition to Brave and Bing (see the configuration sketch after the checklist below). ## Test Plan Tested using the ollama remote distribution, showing parity with the Brave search tool. - Install and run ollama with `ollama run llama3.1:8b-instruct-fp16` - Build the ollama distribution: `llama stack build --template ollama --image-type conda` - Run the stack: `llama stack run /$USER/.llama/distributions/llamastack-ollama/ollama-run.yaml --port 5001` - Client test command: `python -m agents.test_agents.TestAgents.test_create_agent_turn_with_tavily_search`, with environment variables: MASTER_ADDR=0.0.0.0;MASTER_PORT=5001;RANK=0;REMOTE_STACK_HOST=0.0.0.0;REMOTE_STACK_PORT=5001;TAVILY_SEARCH_API_KEY=tvly-;WORLD_SIZE=1 The test passes for this specific case (ollama remote). Server output: ``` Listening on ['::', '0.0.0.0']:5001 INFO: Started server process [7220] INFO: Waiting for application startup. INFO: Application startup complete. INFO: Uvicorn running on http://['::', '0.0.0.0']:5001 (Press CTRL+C to quit) INFO: 127.0.0.1:65209 - "POST /agents/create HTTP/1.1" 200 OK INFO: 127.0.0.1:65210 - "POST /agents/session/create HTTP/1.1" 200 OK INFO: 127.0.0.1:65211 - "POST /agents/turn/create HTTP/1.1" 200 OK role='user' content='What are the latest developments in quantum computing?'
context=None role='assistant' content='' stop_reason= tool_calls=[ToolCall(call_id='fc92ccb8-1039-4ce8-ba5e-8f2b0147661c', tool_name=, arguments={'query': 'latest developments in quantum computing'})] role='ipython' call_id='fc92ccb8-1039-4ce8-ba5e-8f2b0147661c' tool_name= content='{"query": "latest developments in quantum computing", "top_k": [{"title": "IBM Unveils 400 Qubit-Plus Quantum Processor and Next-Generation IBM ...", "url": "https://newsroom.ibm.com/2022-11-09-IBM-Unveils-400-Qubit-Plus-Quantum-Processor-and-Next-Generation-IBM-Quantum-System-Two", "content": "This system is targeted to be online by the end of 2023 and will be a building b......onnect large-scale ...", "url": "https://news.mit.edu/2023/quantum-interconnects-photon-emission-0105", "content": "Quantum computers hold the promise of performing certain tasks that are intractable even on the world\'s most powerful supercomputers. In the future, scientists anticipate using quantum computing to emulate materials systems, simulate quantum chemistry, and optimize hard tasks, with impacts potentially spanning finance to pharmaceuticals.", "score": 0.71721, "raw_content": null}]}' Assistant: The latest developments in quantum computing include: * IBM unveiling its 400 qubit-plus quantum processor and next-generation IBM Quantum System Two, which will be a building block of quantum-centric supercomputing. * The development of utility-scale quantum computing, which can serve as a scientific tool to explore utility-scale classes of problems in chemistry, physics, and materials beyond brute force classical simulation of quantum mechanics. * The introduction of advanced hardware across IBM's global fleet of 100+ qubit systems, as well as easy-to-use software that users and computational scientists can now obtain reliable results from quantum systems as they map increasingly larger and more complex problems to quantum circuits. * Research on quantum repeaters, which use defects in diamond to interconnect quantum systems and could provide the foundation for scalable quantum networking. * The development of a new source of quantum light, which could be used to improve the efficiency of quantum computers. * The creation of a new mathematical "blueprint" that is accelerating fusion device development using Dyson maps. * Research on canceling noise to improve quantum devices, with MIT researchers developing a protocol to extend the life of quantum coherence. ``` Verified with tool response. The final model response is updated with the search requests. ## Sources ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [x] Ran pre-commit to handle lint / formatting issues. - [x] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [x] Updated relevant documentation. - [x] Wrote necessary unit or integration tests. 
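As referenced above, a minimal configuration sketch for the new Tavily engine follows. It mirrors how the test added in this patch builds its `SearchToolDefinition`; the import path and the commented `AgentConfig` line are assumptions for illustration, not part of the diff.

```python
# Sketch only: mirrors how the new test selects the Tavily engine.
# The import path below is an assumption for illustration purposes.
import os

from llama_stack.apis.agents import (
    AgentTool,
    SearchEngineType,
    SearchToolDefinition,
)

tavily_search = SearchToolDefinition(
    type=AgentTool.brave_search.value,  # placeholder tool type, as in the test
    engine=SearchEngineType.tavily,
    api_key=os.environ["TAVILY_SEARCH_API_KEY"],
)

# The definition is then passed through the agent configuration, e.g.:
# agent_config = AgentConfig(**{**common_params, "tools": [tavily_search]})
```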
Co-authored-by: Martin Yuan --- llama_stack/apis/agents/agents.py | 1 + .../agents/meta_reference/tools/builtin.py | 18 +++ .../providers/tests/agents/test_agents.py | 136 +++++++++++------- 3 files changed, 106 insertions(+), 49 deletions(-) diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py index f2602ddde..25de35497 100644 --- a/llama_stack/apis/agents/agents.py +++ b/llama_stack/apis/agents/agents.py @@ -54,6 +54,7 @@ class ToolDefinitionCommon(BaseModel): class SearchEngineType(Enum): bing = "bing" brave = "brave" + tavily = "tavily" @json_schema_type diff --git a/llama_stack/providers/inline/agents/meta_reference/tools/builtin.py b/llama_stack/providers/inline/agents/meta_reference/tools/builtin.py index 4c9cdfcd2..a1e7d08f5 100644 --- a/llama_stack/providers/inline/agents/meta_reference/tools/builtin.py +++ b/llama_stack/providers/inline/agents/meta_reference/tools/builtin.py @@ -86,10 +86,13 @@ class PhotogenTool(SingleMessageBuiltinTool): class SearchTool(SingleMessageBuiltinTool): def __init__(self, engine: SearchEngineType, api_key: str, **kwargs) -> None: self.api_key = api_key + self.engine_type = engine if engine == SearchEngineType.bing: self.engine = BingSearch(api_key, **kwargs) elif engine == SearchEngineType.brave: self.engine = BraveSearch(api_key, **kwargs) + elif engine == SearchEngineType.tavily: + self.engine = TavilySearch(api_key, **kwargs) else: raise ValueError(f"Unknown search engine: {engine}") @@ -257,6 +260,21 @@ class BraveSearch: return {"query": query, "top_k": clean_response} +class TavilySearch: + def __init__(self, api_key: str) -> None: + self.api_key = api_key + + async def search(self, query: str) -> str: + response = requests.post( + "https://api.tavily.com/search", + json={"api_key": self.api_key, "query": query}, + ) + return json.dumps(self._clean_tavily_response(response.json())) + + def _clean_tavily_response(self, search_response, top_k=3): + return {"query": search_response["query"], "top_k": search_response["results"]} + + class WolframAlphaTool(SingleMessageBuiltinTool): def __init__(self, api_key: str) -> None: self.api_key = api_key diff --git a/llama_stack/providers/tests/agents/test_agents.py b/llama_stack/providers/tests/agents/test_agents.py index 60c047058..ee2f3d29f 100644 --- a/llama_stack/providers/tests/agents/test_agents.py +++ b/llama_stack/providers/tests/agents/test_agents.py @@ -68,6 +68,73 @@ def query_attachment_messages(): ] +async def create_agent_turn_with_search_tool( + agents_stack: Dict[str, object], + search_query_messages: List[object], + common_params: Dict[str, str], + search_tool_definition: SearchToolDefinition, +) -> None: + """ + Create an agent turn with a search tool. + + Args: + agents_stack (Dict[str, object]): The agents stack. + search_query_messages (List[object]): The search query messages. + common_params (Dict[str, str]): The common parameters. + search_tool_definition (SearchToolDefinition): The search tool definition. 
+ """ + + # Create an agent with the search tool + agent_config = AgentConfig( + **{ + **common_params, + "tools": [search_tool_definition], + } + ) + + agent_id, session_id = await create_agent_session( + agents_stack.impls[Api.agents], agent_config + ) + turn_request = dict( + agent_id=agent_id, + session_id=session_id, + messages=search_query_messages, + stream=True, + ) + + turn_response = [ + chunk + async for chunk in await agents_stack.impls[Api.agents].create_agent_turn( + **turn_request + ) + ] + + assert len(turn_response) > 0 + assert all( + isinstance(chunk, AgentTurnResponseStreamChunk) for chunk in turn_response + ) + + check_event_types(turn_response) + + # Check for tool execution events + tool_execution_events = [ + chunk + for chunk in turn_response + if isinstance(chunk.event.payload, AgentTurnResponseStepCompletePayload) + and chunk.event.payload.step_details.step_type == StepType.tool_execution.value + ] + assert len(tool_execution_events) > 0, "No tool execution events found" + + # Check the tool execution details + tool_execution = tool_execution_events[0].event.payload.step_details + assert isinstance(tool_execution, ToolExecutionStep) + assert len(tool_execution.tool_calls) > 0 + assert tool_execution.tool_calls[0].tool_name == BuiltinTool.brave_search + assert len(tool_execution.tool_responses) > 0 + + check_turn_complete_event(turn_response, session_id, search_query_messages) + + class TestAgents: @pytest.mark.asyncio async def test_agent_turns_with_safety( @@ -215,63 +282,34 @@ class TestAgents: async def test_create_agent_turn_with_brave_search( self, agents_stack, search_query_messages, common_params ): - agents_impl = agents_stack.impls[Api.agents] - if "BRAVE_SEARCH_API_KEY" not in os.environ: pytest.skip("BRAVE_SEARCH_API_KEY not set, skipping test") - # Create an agent with Brave search tool - agent_config = AgentConfig( - **{ - **common_params, - "tools": [ - SearchToolDefinition( - type=AgentTool.brave_search.value, - api_key=os.environ["BRAVE_SEARCH_API_KEY"], - engine=SearchEngineType.brave, - ) - ], - } + search_tool_definition = SearchToolDefinition( + type=AgentTool.brave_search.value, + api_key=os.environ["BRAVE_SEARCH_API_KEY"], + engine=SearchEngineType.brave, + ) + await create_agent_turn_with_search_tool( + agents_stack, search_query_messages, common_params, search_tool_definition ) - agent_id, session_id = await create_agent_session(agents_impl, agent_config) - turn_request = dict( - agent_id=agent_id, - session_id=session_id, - messages=search_query_messages, - stream=True, + @pytest.mark.asyncio + async def test_create_agent_turn_with_tavily_search( + self, agents_stack, search_query_messages, common_params + ): + if "TAVILY_SEARCH_API_KEY" not in os.environ: + pytest.skip("TAVILY_SEARCH_API_KEY not set, skipping test") + + search_tool_definition = SearchToolDefinition( + type=AgentTool.brave_search.value, # place holder only + api_key=os.environ["TAVILY_SEARCH_API_KEY"], + engine=SearchEngineType.tavily, ) - - turn_response = [ - chunk async for chunk in await agents_impl.create_agent_turn(**turn_request) - ] - - assert len(turn_response) > 0 - assert all( - isinstance(chunk, AgentTurnResponseStreamChunk) for chunk in turn_response + await create_agent_turn_with_search_tool( + agents_stack, search_query_messages, common_params, search_tool_definition ) - check_event_types(turn_response) - - # Check for tool execution events - tool_execution_events = [ - chunk - for chunk in turn_response - if isinstance(chunk.event.payload, 
AgentTurnResponseStepCompletePayload) - and chunk.event.payload.step_details.step_type - == StepType.tool_execution.value - ] - assert len(tool_execution_events) > 0, "No tool execution events found" - - # Check the tool execution details - tool_execution = tool_execution_events[0].event.payload.step_details - assert isinstance(tool_execution, ToolExecutionStep) - assert len(tool_execution.tool_calls) > 0 - assert tool_execution.tool_calls[0].tool_name == BuiltinTool.brave_search - assert len(tool_execution.tool_responses) > 0 - - check_turn_complete_event(turn_response, session_id, search_query_messages) - def check_event_types(turn_response): event_types = [chunk.event.payload.event_type for chunk in turn_response] From 89f5093dfcb9acf53ef2507f51137e1e05202952 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Tue, 19 Nov 2024 21:05:59 -0800 Subject: [PATCH 22/22] Fix tgi doc --- distributions/dependencies.json | 254 +++++++++--------- llama_stack/scripts/distro_codegen.py | 5 +- llama_stack/templates/tgi/build.yaml | 2 +- .../templates/tgi/run-with-safety.yaml | 2 +- llama_stack/templates/tgi/run.yaml | 2 +- llama_stack/templates/tgi/tgi.py | 2 +- 6 files changed, 132 insertions(+), 135 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 0f85b70c6..92ebd1105 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,177 +1,171 @@ { "together": [ - "pypdf", - "sentencepiece", - "pandas", - "redis", - "nltk", - "psycopg2-binary", - "scikit-learn", - "chardet", - "matplotlib", - "pillow", - "tqdm", - "chromadb-client", - "transformers", + "aiosqlite", "blobfile", - "aiosqlite", - "together", + "chardet", + "chromadb-client", "faiss-cpu", - "scipy", - "numpy", - "torch --index-url https://download.pytorch.org/whl/cpu", - "sentence-transformers --no-deps", - "aiosqlite", "fastapi", "fire", "httpx", - "uvicorn" + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "together", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ], "remote-vllm": [ - "pypdf", - "sentencepiece", - "pandas", - "redis", - "nltk", - "psycopg2-binary", - "scikit-learn", - "chardet", - "matplotlib", - "pillow", - "tqdm", - "chromadb-client", - "transformers", - "openai", + "aiosqlite", "blobfile", - "aiosqlite", + "chardet", + "chromadb-client", "faiss-cpu", - "scipy", - "numpy", - "torch --index-url https://download.pytorch.org/whl/cpu", - "sentence-transformers --no-deps", - "aiosqlite", "fastapi", "fire", "httpx", - "uvicorn" + "matplotlib", + "nltk", + "numpy", + "openai", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ], "fireworks": [ - "pypdf", - "sentencepiece", - "pandas", - "redis", - "nltk", - "psycopg2-binary", - "scikit-learn", - "chardet", - "fireworks-ai", - "matplotlib", - "pillow", - "tqdm", - "chromadb-client", - "transformers", + "aiosqlite", "blobfile", - "aiosqlite", + "chardet", + "chromadb-client", "faiss-cpu", - "scipy", - "numpy", - "torch --index-url https://download.pytorch.org/whl/cpu", - "sentence-transformers --no-deps", - "aiosqlite", "fastapi", "fire", + "fireworks-ai", "httpx", - "uvicorn" + "matplotlib", + "nltk", + "numpy", 
+ "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ], "tgi": [ - "pypdf", - "sentencepiece", - "pandas", - "redis", - "nltk", - "psycopg2-binary", - "scikit-learn", - "chardet", - "matplotlib", - "pillow", - "tqdm", - "huggingface_hub", - "chromadb-client", "aiohttp", - "transformers", + "aiosqlite", "blobfile", - "aiosqlite", + "chardet", + "chromadb-client", "faiss-cpu", - "scipy", - "numpy", - "torch --index-url https://download.pytorch.org/whl/cpu", - "sentence-transformers --no-deps", - "aiosqlite", "fastapi", "fire", "httpx", - "uvicorn" + "huggingface_hub", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ], "meta-reference-gpu": [ + "accelerate", + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "fairscale", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "lm-format-enforcer", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", "pypdf", + "redis", + "scikit-learn", + "scipy", "sentencepiece", "torch", - "pandas", - "redis", - "nltk", - "psycopg2-binary", - "scikit-learn", - "chardet", - "accelerate", - "matplotlib", - "pillow", - "fairscale", - "tqdm", - "lm-format-enforcer", - "chromadb-client", - "transformers", - "blobfile", - "aiosqlite", "torchvision", - "faiss-cpu", + "tqdm", + "transformers", + "uvicorn", "zmq", - "scipy", - "numpy", - "torch --index-url https://download.pytorch.org/whl/cpu", "sentence-transformers --no-deps", - "aiosqlite", - "fastapi", - "fire", - "httpx", - "uvicorn" + "torch --index-url https://download.pytorch.org/whl/cpu" ], "ollama": [ - "ollama", - "pypdf", - "sentencepiece", - "pandas", - "redis", - "nltk", - "psycopg2-binary", - "scikit-learn", - "chardet", - "matplotlib", - "pillow", - "tqdm", - "chromadb-client", "aiohttp", - "transformers", + "aiosqlite", "blobfile", - "aiosqlite", + "chardet", + "chromadb-client", "faiss-cpu", - "scipy", - "numpy", - "torch --index-url https://download.pytorch.org/whl/cpu", - "sentence-transformers --no-deps", - "aiosqlite", "fastapi", "fire", "httpx", - "uvicorn" + "matplotlib", + "nltk", + "numpy", + "ollama", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ] } diff --git a/llama_stack/scripts/distro_codegen.py b/llama_stack/scripts/distro_codegen.py index 8bcf97374..b82319bd5 100644 --- a/llama_stack/scripts/distro_codegen.py +++ b/llama_stack/scripts/distro_codegen.py @@ -82,7 +82,10 @@ def collect_template_dependencies(template_dir: Path) -> tuple[str, list[str]]: template = template_func() normal_deps, special_deps = get_provider_dependencies(template.providers) # Combine all dependencies in order: normal deps, special deps, server deps - all_deps = normal_deps + special_deps + SERVER_DEPENDENCIES + all_deps = sorted(list(set(normal_deps + SERVER_DEPENDENCIES))) + sorted( + list(set(special_deps)) + ) + return template.name, all_deps except Exception: return None, [] diff --git a/llama_stack/templates/tgi/build.yaml 
b/llama_stack/templates/tgi/build.yaml index 5f44c2d86..0f7602e2f 100644 --- a/llama_stack/templates/tgi/build.yaml +++ b/llama_stack/templates/tgi/build.yaml @@ -2,7 +2,7 @@ version: '2' name: tgi distribution_spec: description: Use (an external) TGI server for running LLM inference - docker_image: llamastack/distribution-tgi:test-0.0.52rc3 + docker_image: null providers: inference: - remote::tgi diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index b988c28e1..ebf082cd6 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -1,6 +1,6 @@ version: '2' image_name: tgi -docker_image: llamastack/distribution-tgi:test-0.0.52rc3 +docker_image: null conda_env: tgi apis: - agents diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index 485c02ad8..352afabb5 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -1,6 +1,6 @@ version: '2' image_name: tgi -docker_image: llamastack/distribution-tgi:test-0.0.52rc3 +docker_image: null conda_env: tgi apis: - agents diff --git a/llama_stack/templates/tgi/tgi.py b/llama_stack/templates/tgi/tgi.py index 79f2ad395..caa341df3 100644 --- a/llama_stack/templates/tgi/tgi.py +++ b/llama_stack/templates/tgi/tgi.py @@ -41,7 +41,7 @@ def get_distribution_template() -> DistributionTemplate: name="tgi", distro_type="self_hosted", description="Use (an external) TGI server for running LLM inference", - docker_image="llamastack/distribution-tgi:test-0.0.52rc3", + docker_image=None, template_path=Path(__file__).parent / "doc_template.md", providers=providers, default_models=[inference_model, safety_model],
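A closing note on the `distro_codegen.py` change above: the new expression deduplicates and alphabetizes the normal and server dependencies, then appends the special (flag-carrying) dependencies as their own sorted group, which is what produces the large but purely mechanical reordering in `distributions/dependencies.json`. A toy illustration with abridged inputs:

```python
# Toy illustration of the ordering rule introduced in distro_codegen.py above.
# The input lists are abridged; a real template carries many more dependencies.
normal_deps = ["redis", "numpy", "fastapi", "numpy"]  # duplicates are possible
SERVER_DEPENDENCIES = ["fastapi", "fire", "httpx", "uvicorn"]
special_deps = ["torch --index-url https://download.pytorch.org/whl/cpu"]

all_deps = sorted(list(set(normal_deps + SERVER_DEPENDENCIES))) + sorted(
    list(set(special_deps))
)
print(all_deps)
# ['fastapi', 'fire', 'httpx', 'numpy', 'redis', 'uvicorn',
#  'torch --index-url https://download.pytorch.org/whl/cpu']
```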