From 4d2bd2d39ed8bf85bb20ed5af52090d300ecb5e0 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Mon, 21 Oct 2024 18:15:08 -0700
Subject: [PATCH] add more distro templates (#279)

* verify dockers

* together distro verified

* readme

* fireworks distro

* fireworks compose up

* fireworks verified
---
 distributions/README.md                     |  2 +
 distributions/fireworks/README.md           | 55 +++++++++++++++++
 distributions/fireworks/build.yaml          |  2 +-
 distributions/fireworks/compose.yaml        | 18 ++++++
 distributions/fireworks/run.yaml            | 46 ++++++++++++++
 distributions/meta-reference-gpu/README.md  | 11 +---
 distributions/meta-reference-gpu/build.yaml |  2 +-
 distributions/ollama/README.md              |  4 +-
 distributions/ollama/build.yaml             |  4 +-
 distributions/ollama/gpu/compose.yaml       |  2 +-
 distributions/tgi/build.yaml                |  4 +-
 distributions/tgi/cpu/compose.yaml          | 21 -------
 distributions/together/README.md            | 68 +++++++++++++++++++++
 distributions/together/build.yaml           |  4 +-
 distributions/together/compose.yaml         | 18 ++++++
 distributions/together/run.yaml             | 42 +++++++++++++
 llama_stack/distribution/build_container.sh |  2 +-
 llama_stack/providers/registry/inference.py |  2 +-
 18 files changed, 265 insertions(+), 42 deletions(-)
 create mode 100644 distributions/fireworks/README.md
 create mode 100644 distributions/fireworks/compose.yaml
 create mode 100644 distributions/fireworks/run.yaml
 create mode 100644 distributions/together/README.md
 create mode 100644 distributions/together/compose.yaml
 create mode 100644 distributions/together/run.yaml

diff --git a/distributions/README.md b/distributions/README.md
index 92640210b..1802f0c9d 100644
--- a/distributions/README.md
+++ b/distributions/README.md
@@ -9,3 +9,5 @@ A Distribution is where APIs and Providers are assembled together to provide a c
 | Meta Reference | llamastack/distribution-meta-reference-gpu | [Guide](./meta-reference-gpu/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
 | Ollama | llamastack/distribution-ollama | [Guide](./ollama/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
 | TGI | llamastack/distribution-tgi | [Guide](./tgi/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
+| Together | llamastack/distribution-together | [Guide](./together/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
+| Fireworks | llamastack/distribution-fireworks | [Guide](./fireworks/) | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
diff --git a/distributions/fireworks/README.md b/distributions/fireworks/README.md
new file mode 100644
index 000000000..fcf74d809
--- /dev/null
+++ b/distributions/fireworks/README.md
@@ -0,0 +1,55 @@
+# Fireworks Distribution
+
+The `llamastack/distribution-fireworks` distribution consists of the following provider configurations.
+
+
+| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
+|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
+| **Provider(s)** | remote::fireworks | meta-reference | meta-reference | meta-reference | meta-reference |
+
+
+### Start the Distribution (Single Node CPU)
+
+> [!NOTE]
+> This assumes you have a hosted endpoint at Fireworks with an API key.
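+
+Once `docker compose up` (below) reports the server listening on port 5000, you can sanity-check the stack with a quick request. This is only a rough sketch: the `/inference/chat_completion` route, the payload shape, and the model name are assumptions — adjust them to whatever your build of the server actually exposes.
+
+```bash
+# Hypothetical smoke test against the local llama stack server (route and payload assumed)
+curl -X POST http://localhost:5000/inference/chat_completion \
+  -H "Content-Type: application/json" \
+  -d '{
+        "model": "Llama3.1-8B-Instruct",
+        "messages": [{"role": "user", "content": "Hello"}],
+        "stream": false
+      }'
+```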
+
+```
+$ cd llama-stack/distributions/fireworks
+$ ls
+compose.yaml  run.yaml
+$ docker compose up
+```
+
+Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks endpoint, e.g.
+```
+inference:
+  - provider_id: fireworks
+    provider_type: remote::fireworks
+    config:
+      url: https://api.fireworks.ai/inference
+      api_key:
+```
+
+### (Alternative) llama stack run (Single Node GPU)
+
+```
+docker run --network host -it -p 5000:5000 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack/distribution-fireworks --yaml_config /root/my-run.yaml
+```
+
+Make sure your `run.yaml` file has the inference provider pointing to the correct Fireworks endpoint, e.g.
+```
+inference:
+  - provider_id: fireworks
+    provider_type: remote::fireworks
+    config:
+      url: https://api.fireworks.ai/inference
+      api_key:
+```
+
+**Via Conda**
+
+```bash
+llama stack build --config ./build.yaml
+# -- modify run.yaml to a valid Fireworks server endpoint
+llama stack run ./run.yaml
+```
diff --git a/distributions/fireworks/build.yaml b/distributions/fireworks/build.yaml
index 831643ff1..2e5cf0753 100644
--- a/distributions/fireworks/build.yaml
+++ b/distributions/fireworks/build.yaml
@@ -7,4 +7,4 @@ distribution_spec:
     safety: meta-reference
     agents: meta-reference
     telemetry: meta-reference
-image_type: conda
+image_type: docker
diff --git a/distributions/fireworks/compose.yaml b/distributions/fireworks/compose.yaml
new file mode 100644
index 000000000..552806745
--- /dev/null
+++ b/distributions/fireworks/compose.yaml
@@ -0,0 +1,18 @@
+services:
+  llamastack:
+    image: llamastack/distribution-fireworks
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      # Link to the fireworks run.yaml file
+      - ./run.yaml:/root/llamastack-run-fireworks.yaml
+    ports:
+      - "5000:5000"
+    # Start the llama stack server with the mounted run.yaml
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-fireworks.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
diff --git a/distributions/fireworks/run.yaml b/distributions/fireworks/run.yaml
new file mode 100644
index 000000000..c48b0cb7b
--- /dev/null
+++ b/distributions/fireworks/run.yaml
@@ -0,0 +1,46 @@
+version: '2'
+built_at: '2024-10-08T17:40:45.325529'
+image_name: local
+docker_image: null
+conda_env: local
+apis:
+- shields
+- agents
+- models
+- memory
+- memory_banks
+- inference
+- safety
+providers:
+  inference:
+  - provider_id: fireworks0
+    provider_type: remote::fireworks
+    config:
+      url: https://api.fireworks.ai/inference
+  safety:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      llama_guard_shield:
+        model: Llama-Guard-3-1B
+        excluded_categories: []
+        disable_input_check: false
+        disable_output_check: false
+      prompt_guard_shield:
+        model: Prompt-Guard-86M
+  memory:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config: {}
+  agents:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      persistence_store:
+        namespace: null
+        type: sqlite
+        db_path: ~/.llama/runtime/kvstore.db
+  telemetry:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config: {}
diff --git a/distributions/meta-reference-gpu/README.md b/distributions/meta-reference-gpu/README.md
index 951120da5..7f209c4a9 100644
--- a/distributions/meta-reference-gpu/README.md
+++ b/distributions/meta-reference-gpu/README.md
@@ -11,13 +11,8 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
 ### Start the Distribution (Single Node GPU)
 
 > [!NOTE]
-> This assumes you have access to GPU to start a TGI server with access to your GPU.
+> This assumes you have access to a GPU to run the server locally.
 
-> [!NOTE]
-> For GPU inference, you need to set these environment variables for specifying local directory containing your model checkpoints, and enable GPU inference to start running docker container.
-```
-export LLAMA_CHECKPOINT_DIR=~/.llama
-```
 
 > [!NOTE]
 > `~/.llama` should be the path containing downloaded weights of Llama models.
@@ -26,8 +21,8 @@ export LLAMA_CHECKPOINT_DIR=~/.llama
 To download and start running a pre-built docker container, you may use the following commands:
 
 ```
-docker run -it -p 5000:5000 -v ~/.llama:/root/.llama --gpus=all llamastack/llamastack-local-gpu
+docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-gpu --yaml_config /root/my-run.yaml
 ```
 
 ### Alternative (Build and start distribution locally via conda)
-- You may checkout the [Getting Started](../../docs/getting_started.md) for more details on starting up a meta-reference distribution.
+- You may check out the [Getting Started](../../docs/getting_started.md) for more details on building locally via conda and starting up a meta-reference distribution.
diff --git a/distributions/meta-reference-gpu/build.yaml b/distributions/meta-reference-gpu/build.yaml
index ca786c51c..e76197330 100644
--- a/distributions/meta-reference-gpu/build.yaml
+++ b/distributions/meta-reference-gpu/build.yaml
@@ -1,4 +1,4 @@
-name: distribution-meta-reference-gpu
+name: meta-reference-gpu
 distribution_spec:
   description: Use code from `llama_stack` itself to serve all llama stack APIs
   providers:
diff --git a/distributions/ollama/README.md b/distributions/ollama/README.md
index 43c764cbe..d59c3f9e1 100644
--- a/distributions/ollama/README.md
+++ b/distributions/ollama/README.md
@@ -71,10 +71,10 @@ ollama run
 
 **Via Docker**
 ```
-docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./ollama-run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack-local-cpu --yaml_config /root/llamastack-run-ollama.yaml
+docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./gpu/run.yaml:/root/llamastack-run-ollama.yaml --gpus=all distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml
 ```
 
-Make sure in you `ollama-run.yaml` file, you inference provider is pointing to the correct Ollama endpoint. E.g.
+Make sure your `run.yaml` file has the inference provider pointing to the correct Ollama endpoint, e.g.
 ```
 inference:
   - provider_id: ollama0
diff --git a/distributions/ollama/build.yaml b/distributions/ollama/build.yaml
index d14091814..c27f40929 100644
--- a/distributions/ollama/build.yaml
+++ b/distributions/ollama/build.yaml
@@ -1,4 +1,4 @@
-name: distribution-ollama
+name: ollama
 distribution_spec:
   description: Use ollama for running LLM inference
   providers:
@@ -10,4 +10,4 @@ distribution_spec:
     safety: meta-reference
     agents: meta-reference
     telemetry: meta-reference
-image_type: conda
+image_type: docker
diff --git a/distributions/ollama/gpu/compose.yaml b/distributions/ollama/gpu/compose.yaml
index 2e3f85e45..7f9663a8d 100644
--- a/distributions/ollama/gpu/compose.yaml
+++ b/distributions/ollama/gpu/compose.yaml
@@ -33,7 +33,7 @@ services:
     volumes:
       - ~/.llama:/root/.llama
       # Link to ollama run.yaml file
-      - ./ollama-run.yaml:/root/llamastack-run-ollama.yaml
+      - ./run.yaml:/root/llamastack-run-ollama.yaml
     ports:
      - "5000:5000"
     # Hack: wait for ollama server to start before starting docker
diff --git a/distributions/tgi/build.yaml b/distributions/tgi/build.yaml
index c3950e900..2c0ca1d33 100644
--- a/distributions/tgi/build.yaml
+++ b/distributions/tgi/build.yaml
@@ -1,4 +1,4 @@
-name: distribution-tgi
+name: tgi
 distribution_spec:
   description: Use TGI for running LLM inference
   providers:
@@ -10,4 +10,4 @@ distribution_spec:
     safety: meta-reference
     agents: meta-reference
     telemetry: meta-reference
-image_type: conda
+image_type: docker
diff --git a/distributions/tgi/cpu/compose.yaml b/distributions/tgi/cpu/compose.yaml
index df7c74489..2ec10b86c 100644
--- a/distributions/tgi/cpu/compose.yaml
+++ b/distributions/tgi/cpu/compose.yaml
@@ -6,28 +6,7 @@ services:
       - $HOME/.cache/huggingface:/data
     ports:
       - "5009:5009"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
     command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              # that's the closest analogue to --gpus; provide
-              # an integer amount of devices or 'all'
-              count: 1
-              # Devices are reserved using a list of capabilities, making
-              # capabilities the only required field. A device MUST
-              # satisfy all the requested capabilities for a successful
-              # reservation.
-              capabilities: [gpu]
     runtime: nvidia
     healthcheck:
       test: ["CMD", "curl", "-f", "http://text-generation-inference:5009/health"]
diff --git a/distributions/together/README.md b/distributions/together/README.md
new file mode 100644
index 000000000..227c7a450
--- /dev/null
+++ b/distributions/together/README.md
@@ -0,0 +1,68 @@
+# Together Distribution
+
+### Connect to a Llama Stack Together Endpoint
+- You may connect to a hosted endpoint `https://llama-stack.together.ai`, serving a Llama Stack distribution.
+
+The `llamastack/distribution-together` distribution consists of the following provider configurations.
+
+
+| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
+|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
+| **Provider(s)** | remote::together | meta-reference | remote::weaviate | meta-reference | meta-reference |
+
+
+### Start the Distribution (Single Node CPU)
+
+> [!NOTE]
+> This assumes you have a hosted endpoint at Together with an API key.
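+
+Before bringing the stack up, you can check that your API key is valid against the Together endpoint that `run.yaml` points at. A minimal sketch — the `/v1/models` route is assumed here from Together's OpenAI-compatible API, and `TOGETHER_API_KEY` is just a placeholder for wherever you keep the key:
+
+```bash
+# Assumed: an OpenAI-compatible /v1/models route under the base URL used in run.yaml
+curl -H "Authorization: Bearer $TOGETHER_API_KEY" \
+  https://api.together.xyz/v1/models
+```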
+
+```
+$ cd llama-stack/distributions/together
+$ ls
+compose.yaml  run.yaml
+$ docker compose up
+```
+
+Make sure your `run.yaml` file has the inference provider pointing to the correct Together endpoint, e.g.
+```
+inference:
+  - provider_id: together
+    provider_type: remote::together
+    config:
+      url: https://api.together.xyz/v1
+      api_key:
+```
+
+### (Alternative) llama stack run (Single Node GPU)
+
+```
+docker run --network host -it -p 5000:5000 -v ./run.yaml:/root/my-run.yaml --gpus=all llamastack/distribution-together --yaml_config /root/my-run.yaml
+```
+
+Make sure your `run.yaml` file has the inference provider pointing to the correct Together endpoint, e.g.
+```
+inference:
+  - provider_id: together
+    provider_type: remote::together
+    config:
+      url: https://api.together.xyz/v1
+      api_key:
+```
+
+The Together distribution ships with Weaviate as its Memory provider. You also need to configure the remote Weaviate API key and cluster URL in `run.yaml` for the Memory API to work, e.g.
+```
+memory:
+  - provider_id: meta0
+    provider_type: remote::weaviate
+    config:
+      weaviate_api_key:
+      weaviate_cluster_url:
+```
+
+**Via Conda**
+
+```bash
+llama stack build --config ./build.yaml
+# -- modify run.yaml to a valid Together server endpoint
+llama stack run ./run.yaml
+```
diff --git a/distributions/together/build.yaml b/distributions/together/build.yaml
index 67ba2eefa..49eab859d 100644
--- a/distributions/together/build.yaml
+++ b/distributions/together/build.yaml
@@ -3,8 +3,8 @@ distribution_spec:
   description: Use Together.ai for running LLM inference
   providers:
     inference: remote::together
-    memory: meta-reference
+    memory: remote::weaviate
     safety: remote::together
     agents: meta-reference
     telemetry: meta-reference
-image_type: conda
+image_type: docker
diff --git a/distributions/together/compose.yaml b/distributions/together/compose.yaml
new file mode 100644
index 000000000..75c96b686
--- /dev/null
+++ b/distributions/together/compose.yaml
@@ -0,0 +1,18 @@
+services:
+  llamastack:
+    image: llamastack/distribution-together
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      # Link to the together run.yaml file
+      - ./run.yaml:/root/llamastack-run-together.yaml
+    ports:
+      - "5000:5000"
+    # Start the llama stack server with the mounted run.yaml
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-together.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
diff --git a/distributions/together/run.yaml b/distributions/together/run.yaml
new file mode 100644
index 000000000..355080f61
--- /dev/null
+++ b/distributions/together/run.yaml
@@ -0,0 +1,42 @@
+version: '2'
+built_at: '2024-10-08T17:40:45.325529'
+image_name: local
+docker_image: null
+conda_env: local
+apis:
+- shields
+- agents
+- models
+- memory
+- memory_banks
+- inference
+- safety
+providers:
+  inference:
+  - provider_id: together0
+    provider_type: remote::together
+    config:
+      url: https://api.together.xyz/v1
+  safety:
+  - provider_id: together0
+    provider_type: remote::together
+    config:
+      url: https://api.together.xyz/v1
+  memory:
+  - provider_id: meta0
+    provider_type: remote::weaviate
+    config:
+      weaviate_api_key:
+      weaviate_cluster_url:
+  agents:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      persistence_store:
+        namespace: null
+        type: sqlite
+        db_path: ~/.llama/runtime/kvstore.db
+  telemetry:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config: {}
diff --git
a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh index 056a7c06c..19f3df1e3 100755 --- a/llama_stack/distribution/build_container.sh +++ b/llama_stack/distribution/build_container.sh @@ -15,7 +15,7 @@ special_pip_deps="$6" set -euo pipefail build_name="$1" -image_name="llamastack-$build_name" +image_name="distribution-$build_name" docker_base=$2 build_file_path=$3 host_build_dir=$4 diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py index c54cf5939..5a09b6af5 100644 --- a/llama_stack/providers/registry/inference.py +++ b/llama_stack/providers/registry/inference.py @@ -55,7 +55,7 @@ def available_providers() -> List[ProviderSpec]: api=Api.inference, adapter=AdapterSpec( adapter_type="ollama", - pip_packages=["ollama"], + pip_packages=["ollama", "aiohttp"], config_class="llama_stack.providers.adapters.inference.ollama.OllamaImplConfig", module="llama_stack.providers.adapters.inference.ollama", ),
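
With `build_container.sh` now naming images `distribution-<build name>` and the templates switched to `image_type: docker`, a rebuilt image should show up under the new name. A quick way to confirm this — a sketch that assumes the docker build path goes through `build_container.sh` and reuses the `llama stack build --config` invocation from the READMEs above:

```bash
# Rebuild one of the updated templates (sketch; run from the repo root)
llama stack build --config distributions/fireworks/build.yaml
# The image should now be tagged distribution-fireworks rather than llamastack-fireworks
docker images | grep distribution-
```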