Update Fireworks + Together documentation

Ashwin Bharambe 2024-11-18 12:52:23 -08:00
parent 1ecaf2cb3c
commit a562668dcd
27 changed files with 879 additions and 445 deletions

@@ -1,50 +1,91 @@
 version: '2'
-image_name: local
+image_name: fireworks
 docker_image: null
-conda_env: local
+conda_env: null
 apis:
-- shields
 - agents
-- models
-- memory
-- memory_banks
 - inference
+- memory
 - safety
+- telemetry
 providers:
   inference:
-  - provider_id: fireworks0
+  - provider_id: fireworks
     provider_type: remote::fireworks
     config:
       url: https://api.fireworks.ai/inference
-      # api_key: <ENTER_YOUR_API_KEY>
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
+      api_key: ${env.FIREWORKS_API_KEY}
   memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
     config: {}
-# Uncomment to use weaviate memory provider
-# - provider_id: weaviate0
-#   provider_type: remote::weaviate
-#   config: {}
   agents:
-  - provider_id: meta0
+  - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
       persistence_store:
         type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/agents_store.db
   telemetry:
-  - provider_id: meta0
+  - provider_id: meta-reference
     provider_type: inline::meta-reference
     config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/registry.db
models:
- metadata: {}
model_id: fireworks/llama-v3p1-8b-instruct
provider_id: null
provider_model_id: null
- metadata: {}
model_id: fireworks/llama-v3p1-70b-instruct
provider_id: null
provider_model_id: null
- metadata: {}
model_id: fireworks/llama-v3p1-405b-instruct
provider_id: null
provider_model_id: null
- metadata: {}
model_id: fireworks/llama-v3p2-1b-instruct
provider_id: null
provider_model_id: null
- metadata: {}
model_id: fireworks/llama-v3p2-3b-instruct
provider_id: null
provider_model_id: null
- metadata: {}
model_id: fireworks/llama-v3p2-11b-vision-instruct
provider_id: null
provider_model_id: null
- metadata: {}
model_id: fireworks/llama-v3p2-90b-vision-instruct
provider_id: null
provider_model_id: null
- metadata: {}
model_id: fireworks/llama-guard-3-8b
provider_id: null
provider_model_id: null
- metadata: {}
model_id: fireworks/llama-guard-3-11b-vision
provider_id: null
provider_model_id: null
shields:
- params: null
shield_id: meta-llama/Llama-Guard-3-8B
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []
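The new run.yaml files lean on `${env.VAR}` and `${env.VAR:default}` placeholders, which the docs below set at launch time via `--env KEY=VALUE`. As a rough illustration of that substitution behavior (a hedged sketch only, not the actual Llama Stack resolver, whose error handling and syntax details may differ), the following Python expands such placeholders:

```python
import os
import re

# Illustrative only: expands ${env.VAR} and ${env.VAR:default} placeholders
# the way the run.yaml values above are meant to be read. The real resolver
# inside Llama Stack may differ in details (nesting, escaping, errors).
_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z0-9_]+)(?::([^}]*))?\}")

def expand_env_placeholders(value: str) -> str:
    def _sub(match):
        var, default = match.group(1), match.group(2)
        resolved = os.environ.get(var, default)
        if resolved is None:
            raise ValueError(f"environment variable {var} is not set and has no default")
        return resolved
    return _PLACEHOLDER.sub(_sub, value)

# Example: with SQLITE_STORE_DIR unset, the default path from run.yaml is used.
print(expand_env_placeholders(
    "${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/faiss_store.db"
))
```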

@@ -1,6 +1,6 @@
 version: '2'
 image_name: remote-vllm
-docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
+docker_image: null
 conda_env: null
 apis:
 - agents

@@ -1,6 +1,6 @@
 version: '2'
 image_name: remote-vllm
-docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
+docker_image: null
 conda_env: null
 apis:
 - agents

@@ -1,45 +1,87 @@
 version: '2'
-image_name: local
+image_name: together
 docker_image: null
-conda_env: local
+conda_env: null
 apis:
-- shields
 - agents
-- models
-- memory
-- memory_banks
 - inference
+- memory
 - safety
+- telemetry
 providers:
   inference:
-  - provider_id: together0
+  - provider_id: together
     provider_type: remote::together
     config:
       url: https://api.together.xyz/v1
-      # api_key: <ENTER_YOUR_API_KEY>
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
+      api_key: ${env.TOGETHER_API_KEY}
   memory:
-  - provider_id: meta0
-    provider_type: remote::weaviate
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
     config: {}
   agents:
-  - provider_id: meta0
+  - provider_id: meta-reference
    provider_type: inline::meta-reference
     config:
       persistence_store:
         type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/agents_store.db
   telemetry:
-  - provider_id: meta0
+  - provider_id: meta-reference
     provider_type: inline::meta-reference
     config: {}
metadata_store:
namespace: null
type: sqlite
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/registry.db
models:
- metadata: {}
model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo
provider_id: null
provider_model_id: null
- metadata: {}
model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo
provider_id: null
provider_model_id: null
- metadata: {}
model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo
provider_id: null
provider_model_id: null
- metadata: {}
model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo
provider_id: null
provider_model_id: null
- metadata: {}
model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo
provider_id: null
provider_model_id: null
- metadata: {}
model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo
provider_id: null
provider_model_id: null
- metadata: {}
model_id: meta-llama/Meta-Llama-Guard-3-8B
provider_id: null
provider_model_id: null
- metadata: {}
model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo
provider_id: null
provider_model_id: null
shields:
- params: null
shield_id: meta-llama/Llama-Guard-3-1B
provider_id: null
provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []

@@ -2,63 +2,67 @@
The `llamastack/distribution-fireworks` distribution consists of the following provider configurations. The `llamastack/distribution-fireworks` distribution consists of the following provider configurations.
| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| inference | `remote::fireworks` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `inline::llama-guard` |
| telemetry | `inline::meta-reference` |
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | remote::fireworks | meta-reference | meta-reference | meta-reference | meta-reference |
### Step 0. Prerequisite ### Environment Variables
- Make sure you have access to a fireworks API Key. You can get one by visiting [fireworks.ai](https://fireworks.ai/)
### Step 1. Start the Distribution (Single Node CPU) The following environment variables can be configured:
#### (Option 1) Start Distribution Via Docker - `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
> [!NOTE] - `FIREWORKS_API_KEY`: Fireworks.AI API Key (default: ``)
> This assumes you have an hosted endpoint at Fireworks with API Key.
``` ### Models
$ cd distributions/fireworks && docker compose up
The following models are available by default:
- `fireworks/llama-v3p1-8b-instruct`
- `fireworks/llama-v3p1-70b-instruct`
- `fireworks/llama-v3p1-405b-instruct`
- `fireworks/llama-v3p2-1b-instruct`
- `fireworks/llama-v3p2-3b-instruct`
- `fireworks/llama-v3p2-11b-vision-instruct`
- `fireworks/llama-v3p2-90b-vision-instruct`
- `fireworks/llama-guard-3-8b`
- `fireworks/llama-guard-3-11b-vision`
### Prerequisite: API Keys
Make sure you have access to a Fireworks API Key. You can get one by visiting [fireworks.ai](https://fireworks.ai/).
## Running Llama Stack with Fireworks
You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-fireworks \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
``` ```
Make sure in you `run.yaml` file, you inference provider is pointing to the correct Fireworks URL server endpoint. E.g. ### Via Conda
```
inference:
- provider_id: fireworks
provider_type: remote::fireworks
config:
url: https://api.fireworks.ai/inference
api_key: <optional api key>
```
#### (Option 2) Start Distribution Via Conda
```bash ```bash
llama stack build --template fireworks --image-type conda llama stack build --template fireworks --image-type conda
# -- modify run.yaml to a valid Fireworks server endpoint llama stack run ./run.yaml \
llama stack run ./run.yaml --port 5001 \
``` --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
### (Optional) Model Serving
Use `llama-stack-client models list` to check the available models served by Fireworks.
```
$ llama-stack-client models list
+------------------------------+------------------------------+---------------+------------+
| identifier | llama_model | provider_id | metadata |
+==============================+==============================+===============+============+
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | fireworks0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | fireworks0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | fireworks0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-1B-Instruct | Llama3.2-1B-Instruct | fireworks0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | fireworks0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | fireworks0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | fireworks0 | {} |
+------------------------------+------------------------------+---------------+------------+
``` ```
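Once the Fireworks distribution is up on port 5001, a quick smoke test from Python can confirm that the models registered by run.yaml are reachable. This is a hedged sketch: it assumes the `llama-stack-client` Python SDK is installed alongside `llama-stack`, and exact method and argument names may differ between client versions.

```python
# Smoke test against the Fireworks distribution started above.
# Assumption: llama-stack-client SDK, server on localhost:5001;
# parameter names (e.g. model_id) may vary across SDK releases.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")

# List the models registered by the fireworks run.yaml shown earlier.
for model in client.models.list():
    print(model.identifier)

# Run a single chat completion through the remote::fireworks provider.
response = client.inference.chat_completion(
    model_id="fireworks/llama-v3p1-8b-instruct",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.completion_message.content)
```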

@@ -11,90 +11,97 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
| telemetry | `inline::meta-reference` | | telemetry | `inline::meta-reference` |
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Models You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
The following models are configured by default: ## Setting up Ollama server
- `${env.INFERENCE_MODEL}`
- `${env.SAFETY_MODEL}`
## Using Docker Compose Please check the [Ollama Documentation](https://github.com/ollama/ollama) on how to install and run Ollama. After installing Ollama, you need to run `ollama serve` to start the server.
You can use `docker compose` to start a Ollama server and connect with Llama Stack server in a single command. In order to load models, you can run:
```bash ```bash
$ cd distributions/ollama; docker compose up export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
# ollama names this model differently, and we must use the ollama name when loading the model
export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16"
ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m
``` ```
You will see outputs similar to following --- If you are using Llama Stack Safety / Shield APIs, you will also need to pull and run the safety model.
```bash ```bash
[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps" export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B"
[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps"
INFO: Started server process [1] # ollama names this model differently, and we must use the ollama name when loading the model
INFO: Waiting for application startup. export OLLAMA_SAFETY_MODEL="llama-guard3:1b"
INFO: Application startup complete. ollama run $OLLAMA_SAFETY_MODEL --keepalive 60m
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
[llamastack] | Resolved 12 providers
[llamastack] | inner-inference => ollama0
[llamastack] | models => __routing_table__
[llamastack] | inference => __autorouted__
``` ```
To kill the server ## Running Llama Stack
Now you are ready to run Llama Stack with Ollama as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash ```bash
docker compose down LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-v ./run.yaml:/root/my-run.yaml \
--gpus=all \
llamastack/distribution-ollama \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434
``` ```
## Starting Ollama and Llama Stack separately If you are using Llama Stack Safety / Shield APIs, use:
If you wish to separately spin up a Ollama server, and connect with Llama Stack, you should use the following commands.
#### Start Ollama server
- Please check the [Ollama Documentation](https://github.com/ollama/ollama) for more details.
**Via Docker**
```bash ```bash
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-v ./run-with-safety.yaml:/root/my-run.yaml \
--gpus=all \
llamastack/distribution-ollama \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434
``` ```
**Via CLI** ### Via Conda
```bash
ollama run <model_id>
```
#### Start Llama Stack server pointing to Ollama server Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
**Via Conda**
```bash ```bash
llama stack build --template ollama --image-type conda llama stack build --template ollama --image-type conda
llama stack run run.yaml llama stack run ./run.yaml \
--port 5001 \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://127.0.0.1:11434
``` ```
**Via Docker** If you are using Llama Stack Safety / Shield APIs, use:
```
docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./gpu/run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack/distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml
```
Make sure in your `run.yaml` file, your inference provider is pointing to the correct Ollama endpoint. E.g.
```yaml
inference:
- provider_id: ollama0
provider_type: remote::ollama
config:
url: http://127.0.0.1:14343
```
### (Optional) Update Model Serving Configuration
#### Downloading model via Ollama
You can use ollama for managing model downloads.
```bash ```bash
ollama pull llama3.1:8b-instruct-fp16 llama stack run ./run-with-safety.yaml \
ollama pull llama3.1:70b-instruct-fp16 --port 5001 \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env OLLAMA_URL=http://127.0.0.1:11434
``` ```
### (Optional) Update Model Serving Configuration
> [!NOTE] > [!NOTE]
> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models. > Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models.
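Since the new instructions assume an already-running `ollama serve`, it can help to confirm the server is reachable before `llama stack run`. A hedged sketch using only the standard library; the `/api/ps` endpoint is the same one visible in the old log output above, and `OLLAMA_URL` defaults here to the local address used in the Conda instructions:

```python
# Minimal readiness check for the Ollama server that run.yaml points at.
import json
import os
import urllib.request

ollama_url = os.environ.get("OLLAMA_URL", "http://127.0.0.1:11434")

with urllib.request.urlopen(f"{ollama_url}/api/ps", timeout=5) as resp:
    loaded = json.load(resp)

# /api/ps reports the models currently loaded into memory; the
# OLLAMA_INFERENCE_MODEL pulled above should show up here.
for model in loaded.get("models", []):
    print(model.get("name"))
```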

@@ -12,77 +12,106 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following
You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference. You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
### Models
The following models are configured by default:
- `${env.INFERENCE_MODEL}`
- `${env.SAFETY_MODEL}`
## Using Docker Compose
You can use `docker compose` to start a vLLM container and Llama Stack server container together. ## Setting up vLLM server
```bash
$ cd distributions/remote-vllm; docker compose up
```
You will see outputs similar to following --- Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) to get a vLLM endpoint. Here is a sample script to start a vLLM server locally via Docker:
```
<TO BE FILLED>
```
To kill the server
```bash
docker compose down
```
## Starting vLLM and Llama Stack separately
You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
#### Start vLLM server.
```bash ```bash
docker run --runtime nvidia --gpus all \ export INFERENCE_PORT=8000
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export CUDA_VISIBLE_DEVICES=0
docker run \
--runtime nvidia \
--gpus $CUDA_VISIBLE_DEVICES \
-v ~/.cache/huggingface:/root/.cache/huggingface \ -v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \ --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-p 8000:8000 \ -p $INFERENCE_PORT:$INFERENCE_PORT \
--ipc=host \ --ipc=host \
vllm/vllm-openai:latest \ vllm/vllm-openai:latest \
--model meta-llama/Llama-3.2-3B-Instruct --model $INFERENCE_MODEL \
--port $INFERENCE_PORT
``` ```
Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details. If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
#### Start Llama Stack server pointing to your vLLM server
We have provided a template `run.yaml` file in the `distributions/remote-vllm` directory. Please make sure to modify the `inference.provider_id` to point to your vLLM server endpoint. As an example, if your vLLM server is running on `http://127.0.0.1:8000`, your `run.yaml` file should look like the following:
```yaml
inference:
- provider_id: vllm0
provider_type: remote::vllm
config:
url: http://127.0.0.1:8000
```
**Via Conda**
If you are using Conda, you can build and run the Llama Stack server with the following commands:
```bash ```bash
cd distributions/remote-vllm export SAFETY_PORT=8081
llama stack build --template remote-vllm --image-type conda export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
llama stack run run.yaml export CUDA_VISIBLE_DEVICES=1
docker run \
--runtime nvidia \
--gpus $CUDA_VISIBLE_DEVICES \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-p $SAFETY_PORT:$SAFETY_PORT \
--ipc=host \
vllm/vllm-openai:latest \
--model $SAFETY_MODEL \
--port $SAFETY_PORT
``` ```
**Via Docker** ## Running Llama Stack
Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
You can use the Llama Stack Docker image to start the server with the following command:
```bash ```bash
docker run --network host -it -p 5000:5000 \ LLAMA_STACK_PORT=5001
-v ~/.llama:/root/.llama \ docker run \
-v ./gpu/run.yaml:/root/llamastack-run-remote-vllm.yaml \ -it \
--gpus=all \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-remote-vllm \ llamastack/distribution-remote-vllm \
--yaml_config /root/llamastack-run-remote-vllm.yaml /root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT \
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run-with-safety.yaml:/root/my-run.yaml \
llamastack/distribution-remote-vllm \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env VLLM_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT
```
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
```bash
llama stack build --template remote-vllm --image-type conda
llama stack run ./run.yaml \
--port 5001 \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
llama stack run ./run-with-safety.yaml \
--port 5001 \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env VLLM_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
``` ```
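Before wiring `VLLM_URL` (and optionally `VLLM_SAFETY_URL`) into Llama Stack, it is worth confirming that the vLLM containers answer on their OpenAI-compatible API. A hedged sketch with only the standard library; it assumes the `/v1/models` route exposed by `vllm/vllm-openai` and the ports exported above:

```python
# Sanity-check the inference and (optional) safety vLLM servers started above.
import json
import urllib.request

for name, port in [("inference", 8000), ("safety", 8081)]:
    url = f"http://127.0.0.1:{port}/v1/models"
    try:
        with urllib.request.urlopen(url, timeout=5) as resp:
            served = [m["id"] for m in json.load(resp)["data"]]
        print(f"{name} vLLM on :{port} serves: {served}")
    except OSError as exc:
        print(f"{name} vLLM on :{port} not reachable: {exc}")
```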

@@ -29,13 +29,13 @@ The following environment variables can be configured:
Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker: Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker:
```bash ```bash
export TGI_INFERENCE_PORT=8080 export INFERENCE_PORT=8080
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export CUDA_VISIBLE_DEVICES=0 export CUDA_VISIBLE_DEVICES=0
docker run --rm -it \ docker run --rm -it \
-v $HOME/.cache/huggingface:/data \ -v $HOME/.cache/huggingface:/data \
-p $TGI_INFERENCE_PORT:$TGI_INFERENCE_PORT \ -p $INFERENCE_PORT:$INFERENCE_PORT \
--gpus $CUDA_VISIBLE_DEVICES \ --gpus $CUDA_VISIBLE_DEVICES \
ghcr.io/huggingface/text-generation-inference:2.3.1 \ ghcr.io/huggingface/text-generation-inference:2.3.1 \
--dtype bfloat16 \ --dtype bfloat16 \
@ -43,29 +43,29 @@ docker run --rm -it \
--sharded false \ --sharded false \
--cuda-memory-fraction 0.7 \ --cuda-memory-fraction 0.7 \
--model-id $INFERENCE_MODEL \ --model-id $INFERENCE_MODEL \
--port $TGI_INFERENCE_PORT --port $INFERENCE_PORT
``` ```
If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
```bash ```bash
export TGI_SAFETY_PORT=8081 export SAFETY_PORT=8081
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export CUDA_VISIBLE_DEVICES=1 export CUDA_VISIBLE_DEVICES=1
docker run --rm -it \ docker run --rm -it \
-v $HOME/.cache/huggingface:/data \ -v $HOME/.cache/huggingface:/data \
-p $TGI_SAFETY_PORT:$TGI_SAFETY_PORT \ -p $SAFETY_PORT:$SAFETY_PORT \
--gpus $CUDA_VISIBLE_DEVICES \ --gpus $CUDA_VISIBLE_DEVICES \
ghcr.io/huggingface/text-generation-inference:2.3.1 \ ghcr.io/huggingface/text-generation-inference:2.3.1 \
--dtype bfloat16 \ --dtype bfloat16 \
--usage-stats off \ --usage-stats off \
--sharded false \ --sharded false \
--model-id $SAFETY_MODEL \ --model-id $SAFETY_MODEL \
--port $TGI_SAFETY_PORT --port $SAFETY_PORT
``` ```
## Running Llama Stack with TGI as the inference provider ## Running Llama Stack
Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
@ -76,7 +76,6 @@ This method allows you to get started quickly without having to build the distri
```bash ```bash
LLAMA_STACK_PORT=5001 LLAMA_STACK_PORT=5001
docker run \ docker run \
--network host \
-it \ -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \ -v ./run.yaml:/root/my-run.yaml \
@ -84,14 +83,13 @@ docker run \
/root/my-run.yaml \ /root/my-run.yaml \
--port $LLAMA_STACK_PORT \ --port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \ --env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
``` ```
If you are using Llama Stack Safety / Shield APIs, use: If you are using Llama Stack Safety / Shield APIs, use:
```bash ```bash
docker run \ docker run \
--network host \
-it \ -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run-with-safety.yaml:/root/my-run.yaml \ -v ./run-with-safety.yaml:/root/my-run.yaml \
@ -99,9 +97,9 @@ docker run \
/root/my-run.yaml \ /root/my-run.yaml \
--port $LLAMA_STACK_PORT \ --port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \ --env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT \ --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \
--env SAFETY_MODEL=$SAFETY_MODEL \ --env SAFETY_MODEL=$SAFETY_MODEL \
--env TGI_SAFETY_URL=http://host.docker.internal:$TGI_SAFETY_PORT --env TGI_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT
``` ```
### Via Conda ### Via Conda
@ -113,7 +111,7 @@ llama stack build --template tgi --image-type conda
llama stack run ./run.yaml llama stack run ./run.yaml
--port 5001 --port 5001
--env INFERENCE_MODEL=$INFERENCE_MODEL --env INFERENCE_MODEL=$INFERENCE_MODEL
--env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
``` ```
If you are using Llama Stack Safety / Shield APIs, use: If you are using Llama Stack Safety / Shield APIs, use:
@ -122,7 +120,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
llama stack run ./run-with-safety.yaml llama stack run ./run-with-safety.yaml
--port 5001 --port 5001
--env INFERENCE_MODEL=$INFERENCE_MODEL --env INFERENCE_MODEL=$INFERENCE_MODEL
--env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
--env SAFETY_MODEL=$SAFETY_MODEL --env SAFETY_MODEL=$SAFETY_MODEL
--env TGI_SAFETY_URL=http://127.0.0.1:$TGI_SAFETY_PORT --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
``` ```

@@ -1,62 +1,67 @@
# Together Distribution # Fireworks Distribution
### Connect to a Llama Stack Together Endpoint
- You may connect to a hosted endpoint `https://llama-stack.together.ai`, serving a Llama Stack distribution
The `llamastack/distribution-together` distribution consists of the following provider configurations. The `llamastack/distribution-together` distribution consists of the following provider configurations.
| API | Provider(s) |
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** | |-----|-------------|
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- | | agents | `inline::meta-reference` |
| **Provider(s)** | remote::together | meta-reference | meta-reference, remote::weaviate | meta-reference | meta-reference | | inference | `remote::together` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `inline::llama-guard` |
| telemetry | `inline::meta-reference` |
### Docker: Start the Distribution (Single Node CPU) ### Environment Variables
> [!NOTE] The following environment variables can be configured:
> This assumes you have an hosted endpoint at Together with API Key.
``` - `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
$ cd distributions/together && docker compose up - `TOGETHER_API_KEY`: Together.AI API Key (default: ``)
### Models
The following models are available by default:
- `meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo`
- `meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo`
- `meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo`
- `meta-llama/Llama-3.2-3B-Instruct-Turbo`
- `meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo`
- `meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo`
- `meta-llama/Meta-Llama-Guard-3-8B`
- `meta-llama/Llama-Guard-3-11B-Vision-Turbo`
### Prerequisite: API Keys
Make sure you have access to a Together API Key. You can get one by visiting [together.xyz](https://together.xyz/).
## Running Llama Stack with Together
You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-together \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env TOGETHER_API_KEY=$TOGETHER_API_KEY
``` ```
Make sure in your `run.yaml` file, your inference provider is pointing to the correct Together URL server endpoint. E.g. ### Via Conda
```
inference:
- provider_id: together
provider_type: remote::together
config:
url: https://api.together.xyz/v1
api_key: <optional api key>
```
### Conda llama stack run (Single Node CPU)
```bash ```bash
llama stack build --template together --image-type conda llama stack build --template together --image-type conda
# -- modify run.yaml to a valid Together server endpoint llama stack run ./run.yaml \
llama stack run ./run.yaml --port 5001 \
``` --env TOGETHER_API_KEY=$TOGETHER_API_KEY
### (Optional) Update Model Serving Configuration
Use `llama-stack-client models list` to check the available models served by together.
```
$ llama-stack-client models list
+------------------------------+------------------------------+---------------+------------+
| identifier | llama_model | provider_id | metadata |
+==============================+==============================+===============+============+
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0 | {} |
+------------------------------+------------------------------+---------------+------------+
``` ```
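After the Together distribution starts, the models and the `meta-llama/Llama-Guard-3-1B` shield registered by its run.yaml should be queryable. A hedged sketch, again assuming the `llama-stack-client` SDK; method and attribute names may vary by client version:

```python
# Inspect what the Together distribution registered at startup.
# Assumption: llama-stack-client SDK, server on localhost:5001.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:5001")

print("models:")
for model in client.models.list():
    print(f"  {model.identifier}")

print("shields:")
for shield in client.shields.list():
    print(f"  {shield.identifier}")
```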

@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional
+from typing import Any, Dict, Optional

 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
@@ -20,3 +20,10 @@ class FireworksImplConfig(BaseModel):
         default=None,
         description="The Fireworks.ai API Key",
     )
+
+    @classmethod
+    def sample_run_config(cls) -> Dict[str, Any]:
+        return {
+            "url": "https://api.fireworks.ai/inference",
+            "api_key": "${env.FIREWORKS_API_KEY}",
+        }

@@ -35,7 +35,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
 from .config import FireworksImplConfig

-model_aliases = [
+MODEL_ALIASES = [
     build_model_alias(
         "fireworks/llama-v3p1-8b-instruct",
         CoreModelId.llama3_1_8b_instruct.value,
@@ -79,7 +79,7 @@ class FireworksInferenceAdapter(
     ModelRegistryHelper, Inference, NeedsRequestProviderData
 ):
     def __init__(self, config: FireworksImplConfig) -> None:
-        ModelRegistryHelper.__init__(self, model_aliases)
+        ModelRegistryHelper.__init__(self, MODEL_ALIASES)
         self.config = config
         self.formatter = ChatFormat(Tokenizer.get_instance())

@@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional
+from typing import Any, Dict, Optional

 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
@@ -20,3 +20,10 @@ class TogetherImplConfig(BaseModel):
         default=None,
         description="The Together AI API Key",
     )
+
+    @classmethod
+    def sample_run_config(cls) -> Dict[str, Any]:
+        return {
+            "url": "https://api.together.xyz/v1",
+            "api_key": "${env.TOGETHER_API_KEY}",
+        }

@@ -38,7 +38,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
 from .config import TogetherImplConfig

-model_aliases = [
+MODEL_ALIASES = [
     build_model_alias(
         "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
         CoreModelId.llama3_1_8b_instruct.value,
@@ -78,7 +78,7 @@ class TogetherInferenceAdapter(
     ModelRegistryHelper, Inference, NeedsRequestProviderData
 ):
     def __init__(self, config: TogetherImplConfig) -> None:
-        ModelRegistryHelper.__init__(self, model_aliases)
+        ModelRegistryHelper.__init__(self, MODEL_ALIASES)
         self.config = config
         self.formatter = ChatFormat(Tokenizer.get_instance())

@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .fireworks import get_distribution_template # noqa: F401

@@ -1,11 +1,19 @@
+version: '2'
 name: fireworks
 distribution_spec:
-  description: Use Fireworks.ai for running LLM inference
+  description: Use Fireworks.AI for running LLM inference
+  docker_image: null
   providers:
-    inference: remote::fireworks
+    inference:
+    - remote::fireworks
     memory:
     - inline::faiss
-    - remote::weaviate
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda

@@ -0,0 +1,60 @@
# Fireworks Distribution
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
{{ providers_table }}
{% if run_config_env_vars %}
### Environment Variables
The following environment variables can be configured:
{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}
{% if default_models %}
### Models
The following models are available by default:
{% for model in default_models %}
- `{{ model.model_id }}`
{% endfor %}
{% endif %}
### Prerequisite: API Keys
Make sure you have access to a Fireworks API Key. You can get one by visiting [fireworks.ai](https://fireworks.ai/).
## Running Llama Stack with Fireworks
You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```
### Via Conda
```bash
llama stack build --template fireworks --image-type conda
llama stack run ./run.yaml \
--port 5001 \
--env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```

@@ -0,0 +1,60 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.fireworks import FireworksImplConfig
from llama_stack.providers.remote.inference.fireworks.fireworks import MODEL_ALIASES
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::fireworks"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="fireworks",
provider_type="remote::fireworks",
config=FireworksImplConfig.sample_run_config(),
)
default_models = [ModelInput(model_id=m.provider_model_id) for m in MODEL_ALIASES]
return DistributionTemplate(
name="fireworks",
distro_type="self_hosted",
description="Use Fireworks.AI for running LLM inference",
docker_image=None,
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=default_models,
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=default_models,
default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"FIREWORKS_API_KEY": (
"",
"Fireworks.AI API Key",
),
},
)

@@ -6,103 +6,106 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration. You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
{%- if docker_compose_env_vars %} {%- if run_config_env_vars %}
### Environment Variables ### Environment Variables
The following environment variables can be configured: The following environment variables can be configured:
{% for var, (default_value, description) in docker_compose_env_vars.items() %} {% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %} {% endfor %}
{% endif %} {% endif %}
{%- if default_models %}
### Models
The following models are configured by default: ## Setting up Ollama server
{% for model in default_models %}
- `{{ model.model_id }}`
{% endfor %}
{% endif %}
## Using Docker Compose Please check the [Ollama Documentation](https://github.com/ollama/ollama) on how to install and run Ollama. After installing Ollama, you need to run `ollama serve` to start the server.
You can use `docker compose` to start a Ollama server and connect with Llama Stack server in a single command. In order to load models, you can run:
```bash ```bash
$ cd distributions/{{ name }}; docker compose up export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
# ollama names this model differently, and we must use the ollama name when loading the model
export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16"
ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m
``` ```
You will see outputs similar to following --- If you are using Llama Stack Safety / Shield APIs, you will also need to pull and run the safety model.
```bash ```bash
[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps" export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B"
[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps"
INFO: Started server process [1] # ollama names this model differently, and we must use the ollama name when loading the model
INFO: Waiting for application startup. export OLLAMA_SAFETY_MODEL="llama-guard3:1b"
INFO: Application startup complete. ollama run $OLLAMA_SAFETY_MODEL --keepalive 60m
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
[llamastack] | Resolved 12 providers
[llamastack] | inner-inference => ollama0
[llamastack] | models => __routing_table__
[llamastack] | inference => __autorouted__
``` ```
To kill the server ## Running Llama Stack
Now you are ready to run Llama Stack with Ollama as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash ```bash
docker compose down LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-v ./run.yaml:/root/my-run.yaml \
--gpus=all \
llamastack/distribution-{{ name }} \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434
``` ```
## Starting Ollama and Llama Stack separately If you are using Llama Stack Safety / Shield APIs, use:
If you wish to separately spin up a Ollama server, and connect with Llama Stack, you should use the following commands.
#### Start Ollama server
- Please check the [Ollama Documentation](https://github.com/ollama/ollama) for more details.
**Via Docker**
```bash ```bash
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ~/.llama:/root/.llama \
-v ./run-with-safety.yaml:/root/my-run.yaml \
--gpus=all \
llamastack/distribution-{{ name }} \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434
``` ```
**Via CLI** ### Via Conda
```bash
ollama run <model_id>
```
#### Start Llama Stack server pointing to Ollama server Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
**Via Conda**
```bash ```bash
llama stack build --template ollama --image-type conda llama stack build --template ollama --image-type conda
llama stack run run.yaml llama stack run ./run.yaml \
--port 5001 \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://127.0.0.1:11434
``` ```
**Via Docker** If you are using Llama Stack Safety / Shield APIs, use:
```
docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./gpu/run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack/distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml
```
Make sure in your `run.yaml` file, your inference provider is pointing to the correct Ollama endpoint. E.g.
```yaml
inference:
- provider_id: ollama0
provider_type: remote::ollama
config:
url: http://127.0.0.1:14343
```
### (Optional) Update Model Serving Configuration
#### Downloading model via Ollama
You can use ollama for managing model downloads.
```bash ```bash
ollama pull llama3.1:8b-instruct-fp16 llama stack run ./run-with-safety.yaml \
ollama pull llama3.1:70b-instruct-fp16 --port 5001 \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env OLLAMA_URL=http://127.0.0.1:11434
``` ```
### (Optional) Update Model Serving Configuration
> [!NOTE] > [!NOTE]
> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models. > Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models.

@@ -68,17 +68,17 @@ def get_distribution_template() -> DistributionTemplate:
"5001", "5001",
"Port for the Llama Stack distribution server", "Port for the Llama Stack distribution server",
), ),
"OLLAMA_URL": (
"http://127.0.0.1:11434",
"URL of the Ollama server",
),
"INFERENCE_MODEL": ( "INFERENCE_MODEL": (
"meta-llama/Llama-3.2-3B-Instruct", "meta-llama/Llama-3.2-3B-Instruct",
"Inference model loaded into the TGI server", "Inference model loaded into the Ollama server",
),
"OLLAMA_URL": (
"http://host.docker.internal:11434",
"URL of the Ollama server",
), ),
"SAFETY_MODEL": ( "SAFETY_MODEL": (
"meta-llama/Llama-Guard-3-1B", "meta-llama/Llama-Guard-3-1B",
"Name of the safety (Llama-Guard) model to use", "Safety model loaded into the Ollama server",
), ),
}, },
) )

@@ -2,7 +2,7 @@ version: '2'
 name: remote-vllm
 distribution_spec:
   description: Use (an external) vLLM server for running LLM inference
-  docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
+  docker_image: null
   providers:
     inference:
     - remote::vllm

@@ -6,90 +6,114 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following
You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference. You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
{%- if docker_compose_env_vars %} {% if run_config_env_vars %}
### Environment Variables ### Environment Variables
The following environment variables can be configured: The following environment variables can be configured:
{% for var, (default_value, description) in docker_compose_env_vars.items() %} {% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %} {% endfor %}
{% endif %} {% endif %}
{% if default_models %}
### Models
The following models are configured by default: ## Setting up vLLM server
{% for model in default_models %}
- `{{ model.model_id }}`
{% endfor %}
{% endif %}
## Using Docker Compose Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) to get a vLLM endpoint. Here is a sample script to start a vLLM server locally via Docker:
You can use `docker compose` to start a vLLM container and Llama Stack server container together.
```bash
$ cd distributions/{{ name }}; docker compose up
```
You will see outputs similar to following ---
```
<TO BE FILLED>
```
To kill the server
```bash
docker compose down
```
## Starting vLLM and Llama Stack separately
You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
#### Start vLLM server.
```bash ```bash
docker run --runtime nvidia --gpus all \ export INFERENCE_PORT=8000
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export CUDA_VISIBLE_DEVICES=0
docker run \
--runtime nvidia \
--gpus $CUDA_VISIBLE_DEVICES \
-v ~/.cache/huggingface:/root/.cache/huggingface \ -v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \ --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-p 8000:8000 \ -p $INFERENCE_PORT:$INFERENCE_PORT \
--ipc=host \ --ipc=host \
vllm/vllm-openai:latest \ vllm/vllm-openai:latest \
--model meta-llama/Llama-3.2-3B-Instruct --model $INFERENCE_MODEL \
--port $INFERENCE_PORT
``` ```
Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details. If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
```bash
export SAFETY_PORT=8081
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export CUDA_VISIBLE_DEVICES=1
docker run \
--runtime nvidia \
--gpus $CUDA_VISIBLE_DEVICES \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-p $SAFETY_PORT:$SAFETY_PORT \
--ipc=host \
vllm/vllm-openai:latest \
--model $SAFETY_MODEL \
--port $SAFETY_PORT
```
## Running Llama Stack
Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT \
```
If you are using Llama Stack Safety / Shield APIs, use:
```bash
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run-with-safety.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT \
--env SAFETY_MODEL=$SAFETY_MODEL \
--env VLLM_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT
```
### Via Conda
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
#### Start Llama Stack server pointing to your vLLM server
We have provided a template `run.yaml` file in the `distributions/remote-vllm` directory. Please make sure to modify the `inference.provider_id` to point to your vLLM server endpoint. As an example, if your vLLM server is running on `http://127.0.0.1:8000`, your `run.yaml` file should look like the following:
```yaml
inference:
- provider_id: vllm0
provider_type: remote::vllm
config:
url: http://127.0.0.1:8000
```
**Via Conda**
If you are using Conda, you can build and run the Llama Stack server with the following commands:
```bash ```bash
cd distributions/remote-vllm
llama stack build --template remote-vllm --image-type conda llama stack build --template remote-vllm --image-type conda
llama stack run run.yaml llama stack run ./run.yaml \
--port 5001 \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT
``` ```
**Via Docker** If you are using Llama Stack Safety / Shield APIs, use:
You can use the Llama Stack Docker image to start the server with the following command:
```bash ```bash
docker run --network host -it -p 5000:5000 \ llama stack run ./run-with-safety.yaml \
-v ~/.llama:/root/.llama \ --port 5001 \
-v ./gpu/run.yaml:/root/llamastack-run-remote-vllm.yaml \ --env INFERENCE_MODEL=$INFERENCE_MODEL \
--gpus=all \ --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \
llamastack/distribution-remote-vllm \ --env SAFETY_MODEL=$SAFETY_MODEL \
--yaml_config /root/llamastack-run-remote-vllm.yaml --env VLLM_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
``` ```

@@ -41,7 +41,6 @@ def get_distribution_template() -> DistributionTemplate:
name="remote-vllm", name="remote-vllm",
distro_type="self_hosted", distro_type="self_hosted",
description="Use (an external) vLLM server for running LLM inference", description="Use (an external) vLLM server for running LLM inference",
docker_image="llamastack/distribution-remote-vllm:test-0.0.52rc3",
template_path=Path(__file__).parent / "doc_template.md", template_path=Path(__file__).parent / "doc_template.md",
providers=providers, providers=providers,
default_models=[inference_model, safety_model], default_models=[inference_model, safety_model],

@@ -22,13 +22,13 @@ The following environment variables can be configured:
Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker: Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker:
```bash ```bash
export TGI_INFERENCE_PORT=8080 export INFERENCE_PORT=8080
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export CUDA_VISIBLE_DEVICES=0 export CUDA_VISIBLE_DEVICES=0
docker run --rm -it \ docker run --rm -it \
-v $HOME/.cache/huggingface:/data \ -v $HOME/.cache/huggingface:/data \
-p $TGI_INFERENCE_PORT:$TGI_INFERENCE_PORT \ -p $INFERENCE_PORT:$INFERENCE_PORT \
--gpus $CUDA_VISIBLE_DEVICES \ --gpus $CUDA_VISIBLE_DEVICES \
ghcr.io/huggingface/text-generation-inference:2.3.1 \ ghcr.io/huggingface/text-generation-inference:2.3.1 \
--dtype bfloat16 \ --dtype bfloat16 \
@ -36,29 +36,29 @@ docker run --rm -it \
--sharded false \ --sharded false \
--cuda-memory-fraction 0.7 \ --cuda-memory-fraction 0.7 \
--model-id $INFERENCE_MODEL \ --model-id $INFERENCE_MODEL \
--port $TGI_INFERENCE_PORT --port $INFERENCE_PORT
``` ```
If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
```bash ```bash
export TGI_SAFETY_PORT=8081 export SAFETY_PORT=8081
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export CUDA_VISIBLE_DEVICES=1 export CUDA_VISIBLE_DEVICES=1
docker run --rm -it \ docker run --rm -it \
-v $HOME/.cache/huggingface:/data \ -v $HOME/.cache/huggingface:/data \
-p $TGI_SAFETY_PORT:$TGI_SAFETY_PORT \ -p $SAFETY_PORT:$SAFETY_PORT \
--gpus $CUDA_VISIBLE_DEVICES \ --gpus $CUDA_VISIBLE_DEVICES \
ghcr.io/huggingface/text-generation-inference:2.3.1 \ ghcr.io/huggingface/text-generation-inference:2.3.1 \
--dtype bfloat16 \ --dtype bfloat16 \
--usage-stats off \ --usage-stats off \
--sharded false \ --sharded false \
--model-id $SAFETY_MODEL \ --model-id $SAFETY_MODEL \
--port $TGI_SAFETY_PORT --port $SAFETY_PORT
``` ```
## Running Llama Stack with TGI as the inference provider ## Running Llama Stack
Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
@ -69,7 +69,6 @@ This method allows you to get started quickly without having to build the distri
```bash ```bash
LLAMA_STACK_PORT=5001 LLAMA_STACK_PORT=5001
docker run \ docker run \
--network host \
-it \ -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \ -v ./run.yaml:/root/my-run.yaml \
@ -77,14 +76,13 @@ docker run \
/root/my-run.yaml \ /root/my-run.yaml \
--port $LLAMA_STACK_PORT \ --port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \ --env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
``` ```
If you are using Llama Stack Safety / Shield APIs, use: If you are using Llama Stack Safety / Shield APIs, use:
```bash ```bash
docker run \ docker run \
--network host \
-it \ -it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run-with-safety.yaml:/root/my-run.yaml \ -v ./run-with-safety.yaml:/root/my-run.yaml \
@ -92,9 +90,9 @@ docker run \
/root/my-run.yaml \ /root/my-run.yaml \
--port $LLAMA_STACK_PORT \ --port $LLAMA_STACK_PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \ --env INFERENCE_MODEL=$INFERENCE_MODEL \
--env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT \ --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \
--env SAFETY_MODEL=$SAFETY_MODEL \ --env SAFETY_MODEL=$SAFETY_MODEL \
--env TGI_SAFETY_URL=http://host.docker.internal:$TGI_SAFETY_PORT --env TGI_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT
``` ```
### Via Conda ### Via Conda
@ -106,7 +104,7 @@ llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml llama stack run ./run.yaml
--port 5001 --port 5001
--env INFERENCE_MODEL=$INFERENCE_MODEL --env INFERENCE_MODEL=$INFERENCE_MODEL
--env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
``` ```
If you are using Llama Stack Safety / Shield APIs, use: If you are using Llama Stack Safety / Shield APIs, use:
@ -115,7 +113,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
llama stack run ./run-with-safety.yaml llama stack run ./run-with-safety.yaml
--port 5001 --port 5001
--env INFERENCE_MODEL=$INFERENCE_MODEL --env INFERENCE_MODEL=$INFERENCE_MODEL
--env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
--env SAFETY_MODEL=$SAFETY_MODEL --env SAFETY_MODEL=$SAFETY_MODEL
--env TGI_SAFETY_URL=http://127.0.0.1:$TGI_SAFETY_PORT --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
``` ```

@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from .together import get_distribution_template # noqa: F401

@@ -1,11 +1,19 @@
+version: '2'
 name: together
 distribution_spec:
-  description: Use Together.ai for running LLM inference
+  description: Use Together.AI for running LLM inference
+  docker_image: null
   providers:
-    inference: remote::together
+    inference:
+    - remote::together
     memory:
     - inline::faiss
-    - remote::weaviate
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda

@@ -0,0 +1,60 @@
# Fireworks Distribution
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
{{ providers_table }}
{% if run_config_env_vars %}
### Environment Variables
The following environment variables can be configured:
{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}
{% if default_models %}
### Models
The following models are available by default:
{% for model in default_models %}
- `{{ model.model_id }}`
{% endfor %}
{% endif %}
### Prerequisite: API Keys
Make sure you have access to a Together API Key. You can get one by visiting [together.xyz](https://together.xyz/).
## Running Llama Stack with Together
You can do this via Conda (build code) or Docker which has a pre-built image.
### Via Docker
This method allows you to get started quickly without having to build the distribution code.
```bash
LLAMA_STACK_PORT=5001
docker run \
-it \
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-v ./run.yaml:/root/my-run.yaml \
llamastack/distribution-{{ name }} \
/root/my-run.yaml \
--port $LLAMA_STACK_PORT \
--env TOGETHER_API_KEY=$TOGETHER_API_KEY
```
### Via Conda
```bash
llama stack build --template together --image-type conda
llama stack run ./run.yaml \
--port 5001 \
--env TOGETHER_API_KEY=$TOGETHER_API_KEY
```

@@ -0,0 +1,60 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
from pathlib import Path
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.together import TogetherImplConfig
from llama_stack.providers.remote.inference.together.together import MODEL_ALIASES
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
def get_distribution_template() -> DistributionTemplate:
providers = {
"inference": ["remote::together"],
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
"safety": ["inline::llama-guard"],
"agents": ["inline::meta-reference"],
"telemetry": ["inline::meta-reference"],
}
inference_provider = Provider(
provider_id="together",
provider_type="remote::together",
config=TogetherImplConfig.sample_run_config(),
)
default_models = [ModelInput(model_id=m.provider_model_id) for m in MODEL_ALIASES]
return DistributionTemplate(
name="together",
distro_type="self_hosted",
description="Use Together.AI for running LLM inference",
docker_image=None,
template_path=Path(__file__).parent / "doc_template.md",
providers=providers,
default_models=default_models,
run_configs={
"run.yaml": RunConfigSettings(
provider_overrides={
"inference": [inference_provider],
},
default_models=default_models,
default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-1B")],
),
},
run_config_env_vars={
"LLAMASTACK_PORT": (
"5001",
"Port for the Llama Stack distribution server",
),
"TOGETHER_API_KEY": (
"",
"Together.AI API Key",
),
},
)