Update Fireworks + Together documentation

2025-12-17 11:39:47 +00:00 · 2024-11-18 12:52:23 -08:00 · 2024-11-18 12:52:23 -08:00 · a562668dcd
commit a562668dcd
parent 1ecaf2cb3c
27 changed files with 879 additions and 445 deletions
--- a/llama_stack/providers/remote/inference/fireworks/config.py
+++ b/llama_stack/providers/remote/inference/fireworks/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional
+from typing import Any, Dict, Optional

 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
@ -20,3 +20,10 @@ class FireworksImplConfig(BaseModel):
        default=None,
        description="The Fireworks.ai API Key",
    )
+
+    @classmethod
+    def sample_run_config(cls) -> Dict[str, Any]:
+        return {
+            "url": "https://api.fireworks.ai/inference",
+            "api_key": "${env.FIREWORKS_API_KEY}",
+        }
--- a/llama_stack/providers/remote/inference/fireworks/fireworks.py
+++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py
@ -35,7 +35,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
 from .config import FireworksImplConfig


-model_aliases = [
+MODEL_ALIASES = [
    build_model_alias(
        "fireworks/llama-v3p1-8b-instruct",
        CoreModelId.llama3_1_8b_instruct.value,
@ -79,7 +79,7 @@ class FireworksInferenceAdapter(
    ModelRegistryHelper, Inference, NeedsRequestProviderData
 ):
    def __init__(self, config: FireworksImplConfig) -> None:
-        ModelRegistryHelper.__init__(self, model_aliases)
+        ModelRegistryHelper.__init__(self, MODEL_ALIASES)
        self.config = config
        self.formatter = ChatFormat(Tokenizer.get_instance())

--- a/llama_stack/providers/remote/inference/together/config.py
+++ b/llama_stack/providers/remote/inference/together/config.py
@ -4,7 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from typing import Optional
+from typing import Any, Dict, Optional

 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
@ -20,3 +20,10 @@ class TogetherImplConfig(BaseModel):
        default=None,
        description="The Together AI API Key",
    )
+
+    @classmethod
+    def sample_run_config(cls) -> Dict[str, Any]:
+        return {
+            "url": "https://api.together.xyz/v1",
+            "api_key": "${env.TOGETHER_API_KEY}",
+        }
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@ -38,7 +38,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
 from .config import TogetherImplConfig


-model_aliases = [
+MODEL_ALIASES = [
    build_model_alias(
        "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        CoreModelId.llama3_1_8b_instruct.value,
@ -78,7 +78,7 @@ class TogetherInferenceAdapter(
    ModelRegistryHelper, Inference, NeedsRequestProviderData
 ):
    def __init__(self, config: TogetherImplConfig) -> None:
-        ModelRegistryHelper.__init__(self, model_aliases)
+        ModelRegistryHelper.__init__(self, MODEL_ALIASES)
        self.config = config
        self.formatter = ChatFormat(Tokenizer.get_instance())

--- a/llama_stack/templates/fireworks/__init__.py
+++ b/llama_stack/templates/fireworks/__init__.py
@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .fireworks import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/fireworks/build.yaml
+++ b/llama_stack/templates/fireworks/build.yaml
@ -1,11 +1,19 @@
+version: '2'
 name: fireworks
 distribution_spec:
-  description: Use Fireworks.ai for running LLM inference
+  description: Use Fireworks.AI for running LLM inference
+  docker_image: null
  providers:
-    inference: remote::fireworks
+    inference:
+    - remote::fireworks
    memory:
    - inline::faiss
-    - remote::weaviate
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda
--- a/llama_stack/templates/fireworks/doc_template.md
+++ b/llama_stack/templates/fireworks/doc_template.md
@ -0,0 +1,60 @@
+# Fireworks Distribution
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+{% if default_models %}
+### Models
+
+The following models are available by default:
+
+{% for model in default_models %}
+- `{{ model.model_id }}`
+{% endfor %}
+{% endif %}
+
+
+### Prerequisite: API Keys
+
+Make sure you have access to a Fireworks API Key. You can get one by visiting [fireworks.ai](https://fireworks.ai/).
+
+
+## Running Llama Stack with Fireworks
+
+You can do this via Conda (build code) or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run.yaml:/root/my-run.yaml \
+  llamastack/distribution-{{ name }} \
+  /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
+```
+
+### Via Conda
+
+```bash
+llama stack build --template fireworks --image-type conda
+llama stack run ./run.yaml \
+  --port 5001 \
+  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
+```
--- a/llama_stack/templates/fireworks/fireworks.py
+++ b/llama_stack/templates/fireworks/fireworks.py
@ -0,0 +1,60 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pathlib import Path
+
+from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.remote.inference.fireworks import FireworksImplConfig
+from llama_stack.providers.remote.inference.fireworks.fireworks import MODEL_ALIASES
+
+from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
+
+
+def get_distribution_template() -> DistributionTemplate:
+    providers = {
+        "inference": ["remote::fireworks"],
+        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+    }
+
+    inference_provider = Provider(
+        provider_id="fireworks",
+        provider_type="remote::fireworks",
+        config=FireworksImplConfig.sample_run_config(),
+    )
+
+    default_models = [ModelInput(model_id=m.provider_model_id) for m in MODEL_ALIASES]
+
+    return DistributionTemplate(
+        name="fireworks",
+        distro_type="self_hosted",
+        description="Use Fireworks.AI for running LLM inference",
+        docker_image=None,
+        template_path=Path(__file__).parent / "doc_template.md",
+        providers=providers,
+        default_models=default_models,
+        run_configs={
+            "run.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [inference_provider],
+                },
+                default_models=default_models,
+                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
+            ),
+        },
+        run_config_env_vars={
+            "LLAMASTACK_PORT": (
+                "5001",
+                "Port for the Llama Stack distribution server",
+            ),
+            "FIREWORKS_API_KEY": (
+                "",
+                "Fireworks.AI API Key",
+            ),
+        },
+    )
--- a/llama_stack/templates/ollama/doc_template.md
+++ b/llama_stack/templates/ollama/doc_template.md
@ -6,103 +6,106 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following

 You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.

-{%- if docker_compose_env_vars %}
+{%- if run_config_env_vars %}
 ### Environment Variables

 The following environment variables can be configured:

-{% for var, (default_value, description) in docker_compose_env_vars.items() %}
+{% for var, (default_value, description) in run_config_env_vars.items() %}
 - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
 {% endfor %}
 {% endif %}

-{%- if default_models %}
-### Models

-The following models are configured by default:
-{% for model in default_models %}
- `{{ model.model_id }}`
-{% endfor %}
-{% endif %}
+## Setting up Ollama server

-## Using Docker Compose
+Please check the [Ollama Documentation](https://github.com/ollama/ollama) on how to install and run Ollama. After installing Ollama, you need to run `ollama serve` to start the server.

-You can use `docker compose` to start a Ollama server and connect with Llama Stack server in a single command.
+In order to load models, you can run:

 ```bash
-$ cd distributions/{{ name }}; docker compose up
+export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
+
+# ollama names this model differently, and we must use the ollama name when loading the model
+export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16"
+ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m
 ```

-You will see outputs similar to following ---
+If you are using Llama Stack Safety / Shield APIs, you will also need to pull and run the safety model.
+
 ```bash
-[ollama]               | [GIN] 2024/10/18 - 21:19:41 | 200 |     226.841µs |             ::1 | GET      "/api/ps"
-[ollama]               | [GIN] 2024/10/18 - 21:19:42 | 200 |      60.908µs |             ::1 | GET      "/api/ps"
-INFO:     Started server process [1]
-INFO:     Waiting for application startup.
-INFO:     Application startup complete.
-INFO:     Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
-[llamastack] | Resolved 12 providers
-[llamastack] |  inner-inference => ollama0
-[llamastack] |  models => __routing_table__
-[llamastack] |  inference => __autorouted__
+export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B"
+
+# ollama names this model differently, and we must use the ollama name when loading the model
+export OLLAMA_SAFETY_MODEL="llama-guard3:1b"
+ollama run $OLLAMA_SAFETY_MODEL --keepalive 60m
 ```

-To kill the server
+## Running Llama Stack
+
+Now you are ready to run Llama Stack with Ollama as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
 ```bash
-docker compose down
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -v ./run.yaml:/root/my-run.yaml \
+  --gpus=all \
+  llamastack/distribution-{{ name }} \
+  /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env OLLAMA_URL=http://host.docker.internal:11434
 ```

-## Starting Ollama and Llama Stack separately
+If you are using Llama Stack Safety / Shield APIs, use:

-If you wish to separately spin up a Ollama server, and connect with Llama Stack, you should use the following commands.
-
-#### Start Ollama server
- Please check the [Ollama Documentation](https://github.com/ollama/ollama) for more details.
-
-**Via Docker**
 ```bash
-docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -v ./run-with-safety.yaml:/root/my-run.yaml \
+  --gpus=all \
+  llamastack/distribution-{{ name }} \
+  /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env SAFETY_MODEL=$SAFETY_MODEL \
+  --env OLLAMA_URL=http://host.docker.internal:11434
 ```

-**Via CLI**
-```bash
-ollama run <model_id>
-```
+### Via Conda

-#### Start Llama Stack server pointing to Ollama server
-
-**Via Conda**
+Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.

 ```bash
 llama stack build --template ollama --image-type conda
-llama stack run run.yaml
+llama stack run ./run.yaml \
+  --port 5001 \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env OLLAMA_URL=http://127.0.0.1:11434
 ```

-**Via Docker**
-```
-docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./gpu/run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack/distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml
-```
-
-Make sure in your `run.yaml` file, your inference provider is pointing to the correct Ollama endpoint. E.g.
-```yaml
-inference:
-  - provider_id: ollama0
-    provider_type: remote::ollama
-    config:
-      url: http://127.0.0.1:14343
-```
-
-### (Optional) Update Model Serving Configuration
-
-#### Downloading model via Ollama
-
-You can use ollama for managing model downloads.
+If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
-ollama pull llama3.1:8b-instruct-fp16
-ollama pull llama3.1:70b-instruct-fp16
+llama stack run ./run-with-safety.yaml \
+  --port 5001 \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env SAFETY_MODEL=$SAFETY_MODEL \
+  --env OLLAMA_URL=http://127.0.0.1:11434
 ```

+
+### (Optional) Update Model Serving Configuration
+
 > [!NOTE]
 > Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models.

--- a/llama_stack/templates/ollama/ollama.py
+++ b/llama_stack/templates/ollama/ollama.py
@ -68,17 +68,17 @@ def get_distribution_template() -> DistributionTemplate:
                "5001",
                "Port for the Llama Stack distribution server",
            ),
+            "OLLAMA_URL": (
+                "http://127.0.0.1:11434",
+                "URL of the Ollama server",
+            ),
            "INFERENCE_MODEL": (
                "meta-llama/Llama-3.2-3B-Instruct",
-                "Inference model loaded into the TGI server",
-            ),
-            "OLLAMA_URL": (
-                "http://host.docker.internal:11434",
-                "URL of the Ollama server",
+                "Inference model loaded into the Ollama server",
            ),
            "SAFETY_MODEL": (
                "meta-llama/Llama-Guard-3-1B",
-                "Name of the safety (Llama-Guard) model to use",
+                "Safety model loaded into the Ollama server",
            ),
        },
    )
--- a/llama_stack/templates/remote-vllm/build.yaml
+++ b/llama_stack/templates/remote-vllm/build.yaml
@ -2,7 +2,7 @@ version: '2'
 name: remote-vllm
 distribution_spec:
  description: Use (an external) vLLM server for running LLM inference
-  docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
+  docker_image: null
  providers:
    inference:
    - remote::vllm
--- a/llama_stack/templates/remote-vllm/doc_template.md
+++ b/llama_stack/templates/remote-vllm/doc_template.md
@ -6,90 +6,114 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following

 You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.

-{%- if docker_compose_env_vars %}
+{% if run_config_env_vars %}
 ### Environment Variables

 The following environment variables can be configured:

-{% for var, (default_value, description) in docker_compose_env_vars.items() %}
+{% for var, (default_value, description) in run_config_env_vars.items() %}
 - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
 {% endfor %}
 {% endif %}

-{% if default_models %}
-### Models

-The following models are configured by default:
-{% for model in default_models %}
- `{{ model.model_id }}`
-{% endfor %}
-{% endif %}
+## Setting up vLLM server

-## Using Docker Compose
-
-You can use `docker compose` to start a vLLM container and Llama Stack server container together.
-```bash
-$ cd distributions/{{ name }}; docker compose up
-```
-
-You will see outputs similar to following ---
-```
-<TO BE FILLED>
-```
-
-To kill the server
-```bash
-docker compose down
-```
-
-## Starting vLLM and Llama Stack separately
-
-You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
-
-#### Start vLLM server.
+Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) to get a vLLM endpoint. Here is a sample script to start a vLLM server locally via Docker:

 ```bash
-docker run --runtime nvidia --gpus all \
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export CUDA_VISIBLE_DEVICES=0
+
+docker run \
+    --runtime nvidia \
+    --gpus $CUDA_VISIBLE_DEVICES \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
-    -p 8000:8000 \
+    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+    -p $INFERENCE_PORT:$INFERENCE_PORT \
    --ipc=host \
    vllm/vllm-openai:latest \
-    --model meta-llama/Llama-3.2-3B-Instruct
+    --model $INFERENCE_MODEL \
+    --port $INFERENCE_PORT
 ```

-Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details.
+If you are using Llama Stack Safety / Shield APIs, then you will also need to run another vLLM instance with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
+
+```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export CUDA_VISIBLE_DEVICES=1
+
+docker run \
+    --runtime nvidia \
+    --gpus $CUDA_VISIBLE_DEVICES \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+    -p $SAFETY_PORT:$SAFETY_PORT \
+    --ipc=host \
+    vllm/vllm-openai:latest \
+    --model $SAFETY_MODEL \
+    --port $SAFETY_PORT
+```
+
+## Running Llama Stack
+
+Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run.yaml:/root/my-run.yaml \
+  llamastack/distribution-{{ name }} \
+  /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run-with-safety.yaml:/root/my-run.yaml \
+  llamastack/distribution-{{ name }} \
+  /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT \
+  --env SAFETY_MODEL=$SAFETY_MODEL \
+  --env VLLM_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT
+```
+
+
+### Via Conda
+
+Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.

-
-#### Start Llama Stack server pointing to your vLLM server
-
-
-We have provided a template `run.yaml` file in the `distributions/remote-vllm` directory. Please make sure to modify the `inference.provider_id` to point to your vLLM server endpoint. As an example, if your vLLM server is running on `http://127.0.0.1:8000`, your `run.yaml` file should look like the following:
-```yaml
-inference:
-  - provider_id: vllm0
-    provider_type: remote::vllm
-    config:
-      url: http://127.0.0.1:8000
-```
-
-**Via Conda**
-
-If you are using Conda, you can build and run the Llama Stack server with the following commands:
 ```bash
-cd distributions/remote-vllm
 llama stack build --template remote-vllm --image-type conda
-llama stack run run.yaml
+llama stack run ./run.yaml \
+  --port 5001 \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT
 ```

-**Via Docker**
+If you are using Llama Stack Safety / Shield APIs, use:

-You can use the Llama Stack Docker image to start the server with the following command:
 ```bash
-docker run --network host -it -p 5000:5000 \
-  -v ~/.llama:/root/.llama \
-  -v ./gpu/run.yaml:/root/llamastack-run-remote-vllm.yaml \
-  --gpus=all \
-  llamastack/distribution-remote-vllm \
-  --yaml_config /root/llamastack-run-remote-vllm.yaml
+llama stack run ./run-with-safety.yaml \
+  --port 5001 \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \
+  --env SAFETY_MODEL=$SAFETY_MODEL \
+  --env VLLM_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
 ```
--- a/llama_stack/templates/remote-vllm/vllm.py
+++ b/llama_stack/templates/remote-vllm/vllm.py
@ -41,7 +41,6 @@ def get_distribution_template() -> DistributionTemplate:
        name="remote-vllm",
        distro_type="self_hosted",
        description="Use (an external) vLLM server for running LLM inference",
-        docker_image="llamastack/distribution-remote-vllm:test-0.0.52rc3",
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
        default_models=[inference_model, safety_model],
--- a/llama_stack/templates/tgi/doc_template.md
+++ b/llama_stack/templates/tgi/doc_template.md
@ -22,13 +22,13 @@ The following environment variables can be configured:
 Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker:

 ```bash
-export TGI_INFERENCE_PORT=8080
+export INFERENCE_PORT=8080
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 export CUDA_VISIBLE_DEVICES=0

 docker run --rm -it \
  -v $HOME/.cache/huggingface:/data \
-  -p $TGI_INFERENCE_PORT:$TGI_INFERENCE_PORT \
+  -p $INFERENCE_PORT:$INFERENCE_PORT \
  --gpus $CUDA_VISIBLE_DEVICES \
  ghcr.io/huggingface/text-generation-inference:2.3.1 \
  --dtype bfloat16 \
@ -36,29 +36,29 @@ docker run --rm -it \
  --sharded false \
  --cuda-memory-fraction 0.7 \
  --model-id $INFERENCE_MODEL \
-  --port $TGI_INFERENCE_PORT
+  --port $INFERENCE_PORT
 ```

 If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:

 ```bash
-export TGI_SAFETY_PORT=8081
+export SAFETY_PORT=8081
 export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 export CUDA_VISIBLE_DEVICES=1

 docker run --rm -it \
  -v $HOME/.cache/huggingface:/data \
-  -p $TGI_SAFETY_PORT:$TGI_SAFETY_PORT \
+  -p $SAFETY_PORT:$SAFETY_PORT \
  --gpus $CUDA_VISIBLE_DEVICES \
  ghcr.io/huggingface/text-generation-inference:2.3.1 \
  --dtype bfloat16 \
  --usage-stats off \
  --sharded false \
  --model-id $SAFETY_MODEL \
-  --port $TGI_SAFETY_PORT
+  --port $SAFETY_PORT
 ```

-## Running Llama Stack with TGI as the inference provider
+## Running Llama Stack

 Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.

@ -69,7 +69,6 @@ This method allows you to get started quickly without having to build the distri
 ```bash
 LLAMA_STACK_PORT=5001
 docker run \
-  --network host \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ./run.yaml:/root/my-run.yaml \
@ -77,14 +76,13 @@ docker run \
  /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT
+  --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
 ```

 If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
 docker run \
-  --network host \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ./run-with-safety.yaml:/root/my-run.yaml \
@ -92,9 +90,9 @@ docker run \
  /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
-  --env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT \
+  --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \
  --env SAFETY_MODEL=$SAFETY_MODEL \
-  --env TGI_SAFETY_URL=http://host.docker.internal:$TGI_SAFETY_PORT
+  --env TGI_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT
 ```

 ### Via Conda
@ -106,7 +104,7 @@ llama stack build --template {{ name }} --image-type conda
 llama stack run ./run.yaml
  --port 5001
  --env INFERENCE_MODEL=$INFERENCE_MODEL
-  --env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT
+  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
 ```

 If you are using Llama Stack Safety / Shield APIs, use:
@ -115,7 +113,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 llama stack run ./run-with-safety.yaml
  --port 5001
  --env INFERENCE_MODEL=$INFERENCE_MODEL
-  --env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT
+  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
  --env SAFETY_MODEL=$SAFETY_MODEL
-  --env TGI_SAFETY_URL=http://127.0.0.1:$TGI_SAFETY_PORT
+  --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
 ```
--- a/llama_stack/templates/together/__init__.py
+++ b/llama_stack/templates/together/__init__.py
@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .together import get_distribution_template  # noqa: F401
--- a/llama_stack/templates/together/build.yaml
+++ b/llama_stack/templates/together/build.yaml
@ -1,11 +1,19 @@
+version: '2'
 name: together
 distribution_spec:
-  description: Use Together.ai for running LLM inference
+  description: Use Together.AI for running LLM inference
+  docker_image: null
  providers:
-    inference: remote::together
+    inference:
+    - remote::together
    memory:
    - inline::faiss
-    - remote::weaviate
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda
--- a/llama_stack/templates/together/doc_template.md
+++ b/llama_stack/templates/together/doc_template.md
@ -0,0 +1,60 @@
+# Together Distribution
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+{% if default_models %}
+### Models
+
+The following models are available by default:
+
+{% for model in default_models %}
+- `{{ model.model_id }}`
+{% endfor %}
+{% endif %}
+
+
+### Prerequisite: API Keys
+
+Make sure you have access to a Together API Key. You can get one by visiting [together.xyz](https://together.xyz/).
+
+
+## Running Llama Stack with Together
+
+You can do this via Conda (build code) or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ./run.yaml:/root/my-run.yaml \
+  llamastack/distribution-{{ name }} \
+  /root/my-run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env TOGETHER_API_KEY=$TOGETHER_API_KEY
+```
+
+### Via Conda
+
+```bash
+llama stack build --template together --image-type conda
+llama stack run ./run.yaml \
+  --port 5001 \
+  --env TOGETHER_API_KEY=$TOGETHER_API_KEY
+```
--- a/llama_stack/templates/together/together.py
+++ b/llama_stack/templates/together/together.py
@ -0,0 +1,60 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pathlib import Path
+
+from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.remote.inference.together import TogetherImplConfig
+from llama_stack.providers.remote.inference.together.together import MODEL_ALIASES
+
+from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
+
+
+def get_distribution_template() -> DistributionTemplate:
+    providers = {
+        "inference": ["remote::together"],
+        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+    }
+
+    inference_provider = Provider(
+        provider_id="together",
+        provider_type="remote::together",
+        config=TogetherImplConfig.sample_run_config(),
+    )
+
+    default_models = [ModelInput(model_id=m.provider_model_id) for m in MODEL_ALIASES]
+
+    return DistributionTemplate(
+        name="together",
+        distro_type="self_hosted",
+        description="Use Together.AI for running LLM inference",
+        docker_image=None,
+        template_path=Path(__file__).parent / "doc_template.md",
+        providers=providers,
+        default_models=default_models,
+        run_configs={
+            "run.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [inference_provider],
+                },
+                default_models=default_models,
+                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-1B")],
+            ),
+        },
+        run_config_env_vars={
+            "LLAMASTACK_PORT": (
+                "5001",
+                "Port for the Llama Stack distribution server",
+            ),
+            "TOGETHER_API_KEY": (
+                "",
+                "Together.AI API Key",
+            ),
+        },
+    )