From 4d2bd2d39ed8bf85bb20ed5af52090d300ecb5e0 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Mon, 21 Oct 2024 18:15:08 -0700
Subject: [PATCH] add more distro templates (#279)

* verify dockers

* together distro verified

* readme

* fireworks distro

* fireworks compose up

* fireworks verified
---
 distributions/README.md                     |  2 +
 distributions/fireworks/README.md           | 55 +++++++++++++++++
 distributions/fireworks/build.yaml          |  2 +-
 distributions/fireworks/compose.yaml        | 18 ++++++
 distributions/fireworks/run.yaml            | 46 ++++++++++++++
 distributions/meta-reference-gpu/README.md  | 11 +---
 distributions/meta-reference-gpu/build.yaml |  2 +-
 distributions/ollama/README.md              |  4 +-
 distributions/ollama/build.yaml             |  4 +-
 distributions/ollama/gpu/compose.yaml       |  2 +-
 distributions/tgi/build.yaml                |  4 +-
 distributions/tgi/cpu/compose.yaml          | 21 -------
 distributions/together/README.md            | 68 +++++++++++++++++++++
 distributions/together/build.yaml           |  4 +-
 distributions/together/compose.yaml         | 18 ++++++
 distributions/together/run.yaml             | 42 +++++++++++++
 llama_stack/distribution/build_container.sh |  2 +-
 llama_stack/providers/registry/inference.py |  2 +-
 18 files changed, 265 insertions(+), 42 deletions(-)
 create mode 100644 distributions/fireworks/README.md
 create mode 100644 distributions/fireworks/compose.yaml
 create mode 100644 distributions/fireworks/run.yaml
 create mode 100644 distributions/together/README.md
 create mode 100644 distributions/together/compose.yaml
 create mode 100644 distributions/together/run.yaml

diff --git a/distributions/README.md b/distributions/README.md
index 92640210b..1802f0c9d 100644
--- a/distributions/README.md
+++ b/distributions/README.md
@@ -9,3 +9,5 @@ A Distribution is where APIs and Providers are assembled together to provide a c
 | Meta Reference | llamastack/distribution-meta-reference-gpu | [Guide](./meta-reference-gpu/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
 | Ollama | llamastack/distribution-ollama | [Guide](./ollama/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
 | TGI | llamastack/distribution-tgi | [Guide](./tgi/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
+| Together | llamastack/distribution-together | [Guide](./together/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
+| Fireworks | llamastack/distribution-fireworks | [Guide](./fireworks/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
diff --git a/distributions/fireworks/README.md b/distributions/fireworks/README.md
new file mode 100644
index 000000000..fcf74d809
--- /dev/null
+++ b/distributions/fireworks/README.md
@@ -0,0 +1,55 @@
+# Fireworks Distribution
+
+The `llamastack/distribution-fireworks` distribution consists of the following provider configurations.
+
+
+| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
+|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
+| **Provider(s)** | remote::fireworks | meta-reference | meta-reference | meta-reference | meta-reference |
+
+
+### Start the Distribution (Single Node CPU)
+
+> [!NOTE]
+> This assumes you have a hosted endpoint at Fireworks with an API key.
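+
+Once `docker compose up` (below) reports the server listening on port 5000, you can sanity-check the stack with a quick request. This is only a rough sketch: the `/inference/chat_completion` route, the payload shape, and the model name are assumptions — adjust them to whatever your build of the server actually exposes.
+
+```bash
+# Hypothetical smoke test against the local llama stack server (route and payload assumed)
+curl -X POST http://localhost:5000/inference/chat_completion \
+  -H "Content-Type: application/json" \
+  -d '{
+        "model": "Llama3.1-8B-Instruct",
+        "messages": [{"role": "user", "content": "Hello"}],
+        "stream": false
+      }'
+```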
+
+```
+$ cd llama-stack/distributions/fireworks
+$ ls
+compose.yaml  run.yaml
+$ docker compose up
+```
+
+Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks endpoint, e.g.
+```
+inference:
+  - provider_id: fireworks
+    provider_type: remote::fireworks
+    config:
+      url: https://api.fireworks.ai/inference
+      api_key:
+```
+
+### (Alternative) llama stack run (Single Node GPU)
+
+```
+docker run --network host -it -p 5000:5000 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack/distribution-fireworks --yaml_config /root/my-run.yaml
+```
+
+Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks endpoint, e.g.
+```
+inference:
+  - provider_id: fireworks
+    provider_type: remote::fireworks
+    config:
+      url: https://api.fireworks.ai/inference
+      api_key:
+```
+
+**Via Conda**
+
+```bash
+llama stack build --config ./build.yaml
+# -- modify run.yaml to a valid Fireworks server endpoint
+llama stack run ./run.yaml
+```
diff --git a/distributions/fireworks/build.yaml b/distributions/fireworks/build.yaml
index 831643ff1..2e5cf0753 100644
--- a/distributions/fireworks/build.yaml
+++ b/distributions/fireworks/build.yaml
@@ -7,4 +7,4 @@ distribution_spec:
     safety: meta-reference
     agents: meta-reference
     telemetry: meta-reference
-image_type: conda
+image_type: docker
diff --git a/distributions/fireworks/compose.yaml b/distributions/fireworks/compose.yaml
new file mode 100644
index 000000000..552806745
--- /dev/null
+++ b/distributions/fireworks/compose.yaml
@@ -0,0 +1,18 @@
+services:
+  llamastack:
+    image: llamastack/distribution-fireworks
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      # Link to the fireworks run.yaml file
+      - ./run.yaml:/root/llamastack-run-fireworks.yaml
+    ports:
+      - "5000:5000"
+    # Start the llama stack server with the mounted run.yaml
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-fireworks.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
diff --git a/distributions/fireworks/run.yaml b/distributions/fireworks/run.yaml
new file mode 100644
index 000000000..c48b0cb7b
--- /dev/null
+++ b/distributions/fireworks/run.yaml
@@ -0,0 +1,46 @@
+version: '2'
+built_at: '2024-10-08T17:40:45.325529'
+image_name: local
+docker_image: null
+conda_env: local
+apis:
+- shields
+- agents
+- models
+- memory
+- memory_banks
+- inference
+- safety
+providers:
+  inference:
+  - provider_id: fireworks0
+    provider_type: remote::fireworks
+    config:
+      url: https://api.fireworks.ai/inference
+  safety:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      llama_guard_shield:
+        model: Llama-Guard-3-1B
+        excluded_categories: []
+        disable_input_check: false
+        disable_output_check: false
+      prompt_guard_shield:
+        model: Prompt-Guard-86M
+  memory:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config: {}
+  agents:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      persistence_store:
+        namespace: null
+        type: sqlite
+        db_path: ~/.llama/runtime/kvstore.db
+  telemetry:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config: {}
diff --git a/distributions/meta-reference-gpu/README.md b/distributions/meta-reference-gpu/README.md
index 951120da5..7f209c4a9 100644
--- a/distributions/meta-reference-gpu/README.md
+++ b/distributions/meta-reference-gpu/README.md
@@ -11,13 +11,8 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
 ### Start the Distribution (Single Node GPU)
 
 > [!NOTE]
-> This assumes you have access to GPU to start a TGI server with access to your GPU.
+> This assumes you have access to a GPU to run the server locally.
 
-> [!NOTE]
-> For GPU inference, you need to set these environment variables for specifying local directory containing your model checkpoints, and enable GPU inference to start running docker container.
-```
-export LLAMA_CHECKPOINT_DIR=~/.llama
-```
 
 > [!NOTE]
 > `~/.llama` should be the path containing downloaded weights of Llama models.
@@ -26,8 +21,8 @@ export LLAMA_CHECKPOINT_DIR=~/.llama
 To download and start running a pre-built docker container, you may use the following commands:
 
 ```
-docker run -it -p 5000:5000 -v ~/.llama:/root/.llama --gpus=all llamastack/llamastack-local-gpu
+docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-gpu --yaml_config /root/my-run.yaml
 ```
 
 ### Alternative (Build and start distribution locally via conda)
-- You may checkout the [Getting Started](../../docs/getting_started.md) for more details on starting up a meta-reference distribution.
+- You may check out the [Getting Started](../../docs/getting_started.md) for more details on building locally via conda and starting up a meta-reference distribution.
diff --git a/distributions/meta-reference-gpu/build.yaml b/distributions/meta-reference-gpu/build.yaml
index ca786c51c..e76197330 100644
--- a/distributions/meta-reference-gpu/build.yaml
+++ b/distributions/meta-reference-gpu/build.yaml
@@ -1,4 +1,4 @@
-name: distribution-meta-reference-gpu
+name: meta-reference-gpu
 distribution_spec:
   description: Use code from `llama_stack` itself to serve all llama stack APIs
   providers:
diff --git a/distributions/ollama/README.md b/distributions/ollama/README.md
index 43c764cbe..d59c3f9e1 100644
--- a/distributions/ollama/README.md
+++ b/distributions/ollama/README.md
@@ -71,10 +71,10 @@ ollama run
 
 **Via Docker**
 ```
-docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./ollama-run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack-local-cpu --yaml_config /root/llamastack-run-ollama.yaml
+docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./gpu/run.yaml:/root/llamastack-run-ollama.yaml --gpus=all distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml
 ```
 
-Make sure in you `ollama-run.yaml` file, you inference provider is pointing to the correct Ollama endpoint. E.g.
+Make sure your `run.yaml` file has the inference provider pointing to the correct Ollama endpoint, e.g.
 ```
 inference:
   - provider_id: ollama0
diff --git a/distributions/ollama/build.yaml b/distributions/ollama/build.yaml
index d14091814..c27f40929 100644
--- a/distributions/ollama/build.yaml
+++ b/distributions/ollama/build.yaml
@@ -1,4 +1,4 @@
-name: distribution-ollama
+name: ollama
 distribution_spec:
   description: Use ollama for running LLM inference
   providers:
@@ -10,4 +10,4 @@ distribution_spec:
     safety: meta-reference
     agents: meta-reference
     telemetry: meta-reference
-image_type: conda
+image_type: docker
diff --git a/distributions/ollama/gpu/compose.yaml b/distributions/ollama/gpu/compose.yaml
index 2e3f85e45..7f9663a8d 100644
--- a/distributions/ollama/gpu/compose.yaml
+++ b/distributions/ollama/gpu/compose.yaml
@@ -33,7 +33,7 @@ services:
     volumes:
       - ~/.llama:/root/.llama
       # Link to ollama run.yaml file
-      - ./ollama-run.yaml:/root/llamastack-run-ollama.yaml
+      - ./run.yaml:/root/llamastack-run-ollama.yaml
     ports:
      - "5000:5000"
     # Hack: wait for ollama server to start before starting docker
diff --git a/distributions/tgi/build.yaml b/distributions/tgi/build.yaml
index c3950e900..2c0ca1d33 100644
--- a/distributions/tgi/build.yaml
+++ b/distributions/tgi/build.yaml
@@ -1,4 +1,4 @@
-name: distribution-tgi
+name: tgi
 distribution_spec:
   description: Use TGI for running LLM inference
   providers:
@@ -10,4 +10,4 @@ distribution_spec:
     safety: meta-reference
     agents: meta-reference
     telemetry: meta-reference
-image_type: conda
+image_type: docker
diff --git a/distributions/tgi/cpu/compose.yaml b/distributions/tgi/cpu/compose.yaml
index df7c74489..2ec10b86c 100644
--- a/distributions/tgi/cpu/compose.yaml
+++ b/distributions/tgi/cpu/compose.yaml
@@ -6,28 +6,7 @@ services:
       - $HOME/.cache/huggingface:/data
     ports:
       - "5009:5009"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
     command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              # that's the closest analogue to --gpus; provide
-              # an integer amount of devices or 'all'
-              count: 1
-              # Devices are reserved using a list of capabilities, making
-              # capabilities the only required field. A device MUST
-              # satisfy all the requested capabilities for a successful
-              # reservation.
-              capabilities: [gpu]
     runtime: nvidia
     healthcheck:
       test: ["CMD", "curl", "-f", "http://text-generation-inference:5009/health"]
diff --git a/distributions/together/README.md b/distributions/together/README.md
new file mode 100644
index 000000000..227c7a450
--- /dev/null
+++ b/distributions/together/README.md
@@ -0,0 +1,68 @@
+# Together Distribution
+
+### Connect to a Llama Stack Together Endpoint
+- You may connect to a hosted endpoint `https://llama-stack.together.ai`, serving a Llama Stack distribution.
+
+The `llamastack/distribution-together` distribution consists of the following provider configurations.
+
+
+| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
+|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
+| **Provider(s)** | remote::together | meta-reference | remote::weaviate | meta-reference | meta-reference |
+
+
+### Start the Distribution (Single Node CPU)
+
+> [!NOTE]
+> This assumes you have a hosted endpoint at Together with an API key.
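+
+Before bringing the stack up, you can check that your API key is valid against the Together endpoint that `run.yaml` points at. A minimal sketch — the `/v1/models` route is assumed here from Together's OpenAI-compatible API, and `TOGETHER_API_KEY` is just a placeholder for wherever you keep the key:
+
+```bash
+# Assumed: an OpenAI-compatible /v1/models route under the base URL used in run.yaml
+curl -H "Authorization: Bearer $TOGETHER_API_KEY" \
+  https://api.together.xyz/v1/models
+```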
+
+```
+$ cd llama-stack/distributions/together
+$ ls
+compose.yaml  run.yaml
+$ docker compose up
+```
+
+Make sure your `run.yaml` file has the inference provider pointing to the correct Together endpoint, e.g.
+```
+inference:
+  - provider_id: together
+    provider_type: remote::together
+    config:
+      url: https://api.together.xyz/v1
+      api_key:
+```
+
+### (Alternative) llama stack run (Single Node GPU)
+
+```
+docker run --network host -it -p 5000:5000 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack/distribution-together --yaml_config /root/my-run.yaml
+```
+
+Make sure your `run.yaml` file has the inference provider pointing to the correct Together endpoint, e.g.
+```
+inference:
+  - provider_id: together
+    provider_type: remote::together
+    config:
+      url: https://api.together.xyz/v1
+      api_key:
+```
+
+The Together distribution ships with Weaviate as its Memory provider. You also need to configure the remote Weaviate API key and cluster URL in `run.yaml` for the Memory API to work, e.g.
+```
+memory:
+  - provider_id: meta0
+    provider_type: remote::weaviate
+    config:
+      weaviate_api_key:
+      weaviate_cluster_url:
+```
+
+**Via Conda**
+
+```bash
+llama stack build --config ./build.yaml
+# -- modify run.yaml to a valid Together server endpoint
+llama stack run ./run.yaml
+```
diff --git a/distributions/together/build.yaml b/distributions/together/build.yaml
index 67ba2eefa..49eab859d 100644
--- a/distributions/together/build.yaml
+++ b/distributions/together/build.yaml
@@ -3,8 +3,8 @@ distribution_spec:
   description: Use Together.ai for running LLM inference
   providers:
     inference: remote::together
-    memory: meta-reference
+    memory: remote::weaviate
     safety: remote::together
     agents: meta-reference
     telemetry: meta-reference
-image_type: conda
+image_type: docker
diff --git a/distributions/together/compose.yaml b/distributions/together/compose.yaml
new file mode 100644
index 000000000..75c96b686
--- /dev/null
+++ b/distributions/together/compose.yaml
@@ -0,0 +1,18 @@
+services:
+  llamastack:
+    image: llamastack/distribution-together
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      # Link to the together run.yaml file
+      - ./run.yaml:/root/llamastack-run-together.yaml
+    ports:
+      - "5000:5000"
+    # Start the llama stack server with the mounted run.yaml
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-together.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
diff --git a/distributions/together/run.yaml b/distributions/together/run.yaml
new file mode 100644
index 000000000..355080f61
--- /dev/null
+++ b/distributions/together/run.yaml
@@ -0,0 +1,42 @@
+version: '2'
+built_at: '2024-10-08T17:40:45.325529'
+image_name: local
+docker_image: null
+conda_env: local
+apis:
+- shields
+- agents
+- models
+- memory
+- memory_banks
+- inference
+- safety
+providers:
+  inference:
+  - provider_id: together0
+    provider_type: remote::together
+    config:
+      url: https://api.together.xyz/v1
+  safety:
+  - provider_id: together0
+    provider_type: remote::together
+    config:
+      url: https://api.together.xyz/v1
+  memory:
+  - provider_id: meta0
+    provider_type: remote::weaviate
+    config:
+      weaviate_api_key:
+      weaviate_cluster_url:
+  agents:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      persistence_store:
+        namespace: null
+        type: sqlite
+        db_path: ~/.llama/runtime/kvstore.db
+  telemetry:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config: {}
diff --git
a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index 056a7c06c..19f3df1e3 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -15,7 +15,7 @@ special_pip_deps="$6" set -euo pipefail build_name="$1" -image_name="llamastack-$build_name" +image_name="distribution-$build_name" docker_base=$2 build_file_path=$3 host_build_dir=$4 diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index c54cf5939..5a09b6af5 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -55,7 +55,7 @@ def available_providers() -> List[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="ollama", - pip_packages=["ollama"], + pip_packages=["ollama", "aiohttp"], config_class="llama_stack.providers.adapters.inference.ollama.OllamaImplConfig", module="llama_stack.providers.adapters.inference.ollama", ),
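
With `build_container.sh` now naming images `distribution-<build name>` and the templates switched to `image_type: docker`, a rebuilt image should show up under the new name. A quick way to confirm this — a sketch that assumes the docker build path goes through `build_container.sh` and reuses the `llama stack build --config` invocation from the READMEs above:

```bash
# Rebuild one of the updated templates (sketch; run from the repo root)
llama stack build --config distributions/fireworks/build.yaml
# The image should now be tagged distribution-fireworks rather than llamastack-fireworks
docker images | grep distribution-
```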