fix broken --list-templates by adding build.yaml files for packaging (#327)

* add build files to templates

* fix templates

* manifest

* symlink

* symlink

* precommit

* change everything to docker build.yaml

* remove image_type in templates

* fix build from templates CLI

* fix readmes
Xi Yan 2024-10-25 12:51:22 -07:00 committed by GitHub
parent afae4e3d8e
commit 07f9bf723f
32 changed files with 161 additions and 158 deletions
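
The gist of the change: per-template `build.yaml` specs now live inside the package at `llama_stack/templates/<name>/build.yaml` and are listed in `MANIFEST.in`, so `llama stack build --list-templates` can discover them after installation (previously they sat in the repo-level `distributions/` folder, which is not part of the installed `llama_stack` package). The snippet below is a minimal, self-contained sketch of that discovery step under those assumptions — not the project's actual implementation, and `list_templates` is just an illustrative helper name:

```python
from pathlib import Path

import yaml  # PyYAML, already used by llama-stack to parse build configs

# After this commit, packaged template specs live at
# llama_stack/templates/<name>/build.yaml (see the MANIFEST.in change below).
TEMPLATES_PATH = Path("llama_stack") / "templates"


def list_templates(templates_path: Path = TEMPLATES_PATH) -> None:
    """Print the name and description of every packaged template spec."""
    for build_yaml in sorted(templates_path.glob("*/build.yaml")):
        spec = yaml.safe_load(build_yaml.read_text())
        name = spec.get("name", build_yaml.parent.name)
        description = spec.get("distribution_spec", {}).get("description", "")
        print(f"{name}: {description}")


if __name__ == "__main__":
    list_templates()
```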

@ -1,4 +1,4 @@
include requirements.txt
include llama_stack/distribution/*.sh
include llama_stack/cli/scripts/*.sh
-include distributions/*/build.yaml
+include llama_stack/templates/*/build.yaml

@ -1,10 +0,0 @@
name: bedrock
distribution_spec:
description: Use Amazon Bedrock APIs.
providers:
inference: remote::bedrock
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

@ -0,0 +1 @@
../../llama_stack/templates/bedrock/build.yaml

@ -1,10 +0,0 @@
name: databricks
distribution_spec:
description: Use Databricks for running LLM inference
providers:
inference: remote::databricks
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

@ -0,0 +1 @@
../../llama_stack/templates/databricks/build.yaml

@ -49,7 +49,7 @@ inference:
**Via Conda**
```bash
-llama stack build --config ./build.yaml
+llama stack build --template fireworks --image-type conda
# -- modify run.yaml to a valid Fireworks server endpoint
llama stack run ./run.yaml
```

@ -1,10 +0,0 @@
name: fireworks
distribution_spec:
description: Use Fireworks.ai for running LLM inference
providers:
inference: remote::fireworks
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

@ -0,0 +1 @@
../../llama_stack/templates/fireworks/build.yaml

@ -1,10 +0,0 @@
name: hf-endpoint
distribution_spec:
description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints."
providers:
inference: remote::hf::endpoint
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

@ -0,0 +1 @@
../../llama_stack/templates/hf-endpoint/build.yaml

@ -1,10 +0,0 @@
name: hf-serverless
distribution_spec:
description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
providers:
inference: remote::hf::serverless
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

@ -0,0 +1 @@
../../llama_stack/templates/hf-serverless/build.yaml

@ -1,14 +0,0 @@
name: meta-reference-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-gpu/build.yaml

@ -1,14 +0,0 @@
name: meta-reference-quantized-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference-quantized
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

@ -0,0 +1 @@
../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml

@ -86,6 +86,6 @@ inference:
**Via Conda**
```
-llama stack build --config ./build.yaml
+llama stack build --template ollama --image-type conda
llama stack run ./gpu/run.yaml
```

@ -1,13 +0,0 @@
name: ollama
distribution_spec:
description: Use ollama for running LLM inference
providers:
inference: remote::ollama
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

@ -0,0 +1 @@
../../llama_stack/templates/ollama/build.yaml

@ -88,7 +88,7 @@ inference:
**Via Conda**
```bash
-llama stack build --config ./build.yaml
+llama stack build --template tgi --image-type conda
# -- start a TGI server endpoint
llama stack run ./gpu/run.yaml
```

@ -1,13 +0,0 @@
name: tgi
distribution_spec:
description: Use TGI for running LLM inference
providers:
inference: remote::tgi
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: docker

@ -0,0 +1 @@
../../llama_stack/templates/tgi/build.yaml

@ -62,7 +62,7 @@ memory:
**Via Conda**
```bash
-llama stack build --config ./build.yaml
+llama stack build --template together --image-type conda
# -- modify run.yaml to a valid Together server endpoint
llama stack run ./run.yaml
```

@ -1,10 +0,0 @@
name: together
distribution_spec:
description: Use Together.ai for running LLM inference
providers:
inference: remote::together
memory: remote::weaviate
safety: remote::together
agents: meta-reference
telemetry: meta-reference
image_type: docker

@ -0,0 +1 @@
../../llama_stack/templates/together/build.yaml

@ -1,10 +0,0 @@
name: vllm
distribution_spec:
description: Like local, but use vLLM for running LLM inference
providers:
inference: vllm
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference
image_type: conda

@ -0,0 +1 @@
../../llama_stack/templates/vllm/build.yaml

@ -279,11 +279,11 @@ llama stack build --list-templates
You may then pick a template to build your distribution with providers fitted to your liking.
```
-llama stack build --template local-tgi --name my-tgi-stack
+llama stack build --template local-tgi --name my-tgi-stack --image-type conda
```
```
-$ llama stack build --template local-tgi --name my-tgi-stack
+$ llama stack build --template local-tgi --name my-tgi-stack --image-type conda
...
...
Build spec configuration saved at ~/.conda/envs/llamastack-my-tgi-stack/my-tgi-stack-build.yaml
@ -293,10 +293,10 @@ You may now run `llama stack configure my-tgi-stack` or `llama stack configure ~
#### Building from config file
- In addition to templates, you may customize the build to your liking through editing config files and build from config files with the following command.
-- The config file will be of contents like the ones in `llama_stack/distributions/templates/`.
+- The config file will be of contents like the ones in `llama_stack/templates/`.
```
-$ cat llama_stack/distribution/templates/local-ollama-build.yaml
+$ cat build.yaml
name: local-ollama
distribution_spec:
@ -311,7 +311,7 @@ image_type: conda
```
```
-llama stack build --config llama_stack/distribution/templates/local-ollama-build.yaml
+llama stack build --config build.yaml
```
#### How to build distribution with Docker image

@ -35,11 +35,7 @@ You have two ways to start up Llama stack server:
1. **Starting up server via docker**:
-We provide 2 pre-built Docker image of Llama Stack distribution, which can be found in the following links.
-- [llamastack-local-gpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-gpu/general)
-  - This is a packaged version with our local meta-reference implementations, where you will be running inference locally with downloaded Llama model checkpoints.
-- [llamastack-local-cpu](https://hub.docker.com/repository/docker/llamastack/llamastack-local-cpu/general)
-  - This is a lite version with remote inference where you can hook up to your favourite remote inference framework (e.g. ollama, fireworks, together, tgi) for running inference without GPU.
+We provide pre-built Docker image of Llama Stack distribution, which can be found in the following links in the [distributions](../distributions/) folder.
> [!NOTE]
> For GPU inference, you need to set these environment variables for specifying local directory containing your model checkpoints, and enable GPU inference to start running docker container.

@ -12,9 +12,7 @@ import os
from functools import lru_cache
from pathlib import Path
-TEMPLATES_PATH = (
-    Path(os.path.relpath(__file__)).parent.parent.parent.parent / "distributions"
-)
+TEMPLATES_PATH = Path(os.path.relpath(__file__)).parent.parent.parent / "templates"
@lru_cache()
@ -26,7 +24,6 @@ def available_templates_specs() -> List[BuildConfig]:
with open(p, "r") as f:
build_config = BuildConfig(**yaml.safe_load(f))
template_specs.append(build_config)
return template_specs
@ -99,19 +96,22 @@ class StackBuild(Subcommand):
"You must specify a name for the build using --name when using a template"
)
return
build_path = TEMPLATES_PATH / f"{args.template}-build.yaml"
if not build_path.exists():
self.parser.error(
f"Could not find template {args.template}. Please run `llama stack build --list-templates` to check out the available templates"
)
return
with open(build_path, "r") as f:
build_config = BuildConfig(**yaml.safe_load(f))
build_config.name = args.name
if args.image_type:
build_config.image_type = args.image_type
self._run_stack_build_command_from_build_config(build_config)
available_templates = available_templates_specs()
for build_config in available_templates:
if build_config.name == args.template:
build_config.name = args.name
if args.image_type:
build_config.image_type = args.image_type
else:
self.parser.error(
f"Please specify a image-type (docker | conda) for {args.template}"
)
self._run_stack_build_command_from_build_config(build_config)
return
self.parser.error(
f"Could not find template {args.template}. Please run `llama stack build --list-templates` to check out the available templates"
)
return
# try to see if we can find a pre-existing build config file through name
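
To restate the new `--template` flow above: the CLI now resolves the requested template by name from the packaged specs, substitutes the user-supplied `--name`, and, since `image_type` was dropped from the template files, requires an explicit `--image-type`. Below is a condensed, self-contained sketch of that control flow — simplified stand-in types rather than the real CLI classes, with `ValueError` standing in for `parser.error`:

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class BuildConfig:
    # Simplified stand-in for the real BuildConfig model.
    name: str
    image_type: Optional[str] = None


def resolve_template(
    templates: List[BuildConfig],
    template: str,
    name: str,
    image_type: Optional[str],
) -> BuildConfig:
    """Find the template by name, rename it, and require an image type."""
    for build_config in templates:
        if build_config.name != template:
            continue
        build_config.name = name
        if image_type:
            build_config.image_type = image_type
        else:
            raise ValueError(
                f"Please specify a image-type (docker | conda) for {template}"
            )
        return build_config
    raise ValueError(
        f"Could not find template {template}. "
        "Run `llama stack build --list-templates` to see the available templates."
    )


# Example: build the packaged `tgi` template under a custom name.
specs = [BuildConfig(name="tgi"), BuildConfig(name="ollama")]
print(resolve_template(specs, template="tgi", name="my-tgi-stack", image_type="conda"))
```

This is also why the READMEs in this commit switch from `llama stack build --config ./build.yaml` to `llama stack build --template <name> --image-type conda`.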

@ -8,18 +8,19 @@ from enum import Enum
from typing import List, Optional
import pkg_resources
from llama_stack.distribution.utils.exec import run_with_pty
from pydantic import BaseModel
from termcolor import cprint
from llama_stack.distribution.utils.exec import run_with_pty
from llama_stack.distribution.datatypes import * # noqa: F403
from pathlib import Path
from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR
from llama_stack.distribution.distribution import get_provider_registry
from llama_stack.distribution.utils.config_dirs import BUILDS_BASE_DIR
# These are the dependencies needed by the distribution server.
# `llama-stack` is automatically installed by the installation script.

@ -1,5 +1,11 @@
#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
if [[ $# -ne 1 ]]; then
echo "Error: Please provide the name of CONDA environment you wish to create"
exit 1

@ -0,0 +1,9 @@
name: bedrock
distribution_spec:
description: Use Amazon Bedrock APIs.
providers:
inference: remote::bedrock
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

@ -0,0 +1,9 @@
name: databricks
distribution_spec:
description: Use Databricks for running LLM inference
providers:
inference: remote::databricks
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

@ -0,0 +1,9 @@
name: fireworks
distribution_spec:
description: Use Fireworks.ai for running LLM inference
providers:
inference: remote::fireworks
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

@ -0,0 +1,9 @@
name: hf-endpoint
distribution_spec:
description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints."
providers:
inference: remote::hf::endpoint
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

@ -0,0 +1,9 @@
name: hf-serverless
distribution_spec:
description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
providers:
inference: remote::hf::serverless
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

@ -0,0 +1,13 @@
name: meta-reference-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

@ -0,0 +1,13 @@
name: meta-reference-quantized-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference-quantized
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

@ -0,0 +1,12 @@
name: ollama
distribution_spec:
description: Use ollama for running LLM inference
providers:
inference: remote::ollama
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

@ -0,0 +1,12 @@
name: tgi
distribution_spec:
description: Use TGI for running LLM inference
providers:
inference: remote::tgi
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

@ -0,0 +1,9 @@
name: together
distribution_spec:
description: Use Together.ai for running LLM inference
providers:
inference: remote::together
memory: remote::weaviate
safety: remote::together
agents: meta-reference
telemetry: meta-reference

@ -0,0 +1,9 @@
name: vllm
distribution_spec:
description: Like local, but use vLLM for running LLM inference
providers:
inference: vllm
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference