From cbb423a32f1b1c26316c0671b8d3d2fe411f9f46 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Fri, 18 Oct 2024 17:21:50 -0700
Subject: [PATCH] move distribution/templates to distributions/

---
 distributions/bedrock/build.yaml            | 10 +++++
 distributions/databricks/build.yaml         | 10 +++++
 distributions/fireworks/build.yaml          | 10 +++++
 distributions/hf-endpoint/build.yaml        | 10 +++++
 distributions/hf-serverless/build.yaml      | 10 +++++
 distributions/meta-reference-gpu/build.yaml | 13 ++++++
 distributions/meta-reference-gpu/run.yaml   | 50 +++++++++++++++++++++
 distributions/tgi/README.md                 | 11 ++++-
 distributions/together/build.yaml           | 10 +++++
 distributions/vllm/build.yaml               | 10 +++++
 10 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 distributions/bedrock/build.yaml
 create mode 100644 distributions/databricks/build.yaml
 create mode 100644 distributions/fireworks/build.yaml
 create mode 100644 distributions/hf-endpoint/build.yaml
 create mode 100644 distributions/hf-serverless/build.yaml
 create mode 100644 distributions/meta-reference-gpu/build.yaml
 create mode 100644 distributions/meta-reference-gpu/run.yaml
 create mode 100644 distributions/together/build.yaml
 create mode 100644 distributions/vllm/build.yaml

diff --git a/distributions/bedrock/build.yaml b/distributions/bedrock/build.yaml
new file mode 100644
index 000000000..ae7b27d49
--- /dev/null
+++ b/distributions/bedrock/build.yaml
@@ -0,0 +1,10 @@
+name: bedrock
+distribution_spec:
+  description: Use Amazon Bedrock APIs.
+  providers:
+    inference: remote::bedrock
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
+image_type: conda
diff --git a/distributions/databricks/build.yaml b/distributions/databricks/build.yaml
new file mode 100644
index 000000000..2188dd0a0
--- /dev/null
+++ b/distributions/databricks/build.yaml
@@ -0,0 +1,10 @@
+name: databricks
+distribution_spec:
+  description: Use Databricks for running LLM inference
+  providers:
+    inference: remote::databricks
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
+image_type: conda
diff --git a/distributions/fireworks/build.yaml b/distributions/fireworks/build.yaml
new file mode 100644
index 000000000..831643ff1
--- /dev/null
+++ b/distributions/fireworks/build.yaml
@@ -0,0 +1,10 @@
+name: fireworks
+distribution_spec:
+  description: Use Fireworks.ai for running LLM inference
+  providers:
+    inference: remote::fireworks
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
+image_type: conda
diff --git a/distributions/hf-endpoint/build.yaml b/distributions/hf-endpoint/build.yaml
new file mode 100644
index 000000000..750bebcb5
--- /dev/null
+++ b/distributions/hf-endpoint/build.yaml
@@ -0,0 +1,10 @@
+name: hf-endpoint
+distribution_spec:
+  description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints."
+  providers:
+    inference: remote::hf::endpoint
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
+image_type: conda
diff --git a/distributions/hf-serverless/build.yaml b/distributions/hf-serverless/build.yaml
new file mode 100644
index 000000000..f6da3ad4d
--- /dev/null
+++ b/distributions/hf-serverless/build.yaml
@@ -0,0 +1,10 @@
+name: hf-serverless
+distribution_spec:
+  description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
+  providers:
+    inference: remote::hf::serverless
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
+image_type: conda
diff --git a/distributions/meta-reference-gpu/build.yaml b/distributions/meta-reference-gpu/build.yaml
new file mode 100644
index 000000000..e76197330
--- /dev/null
+++ b/distributions/meta-reference-gpu/build.yaml
@@ -0,0 +1,13 @@
+name: meta-reference-gpu
+distribution_spec:
+  description: Use code from `llama_stack` itself to serve all llama stack APIs
+  providers:
+    inference: meta-reference
+    memory:
+    - meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
+image_type: docker
diff --git a/distributions/meta-reference-gpu/run.yaml b/distributions/meta-reference-gpu/run.yaml
new file mode 100644
index 000000000..724ca030a
--- /dev/null
+++ b/distributions/meta-reference-gpu/run.yaml
@@ -0,0 +1,50 @@
+version: '2'
+built_at: '2024-10-08T17:40:45.325529'
+image_name: local
+docker_image: null
+conda_env: local
+apis:
+- shields
+- agents
+- models
+- memory
+- memory_banks
+- inference
+- safety
+providers:
+  inference:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      model: Llama3.1-8B-Instruct
+      quantization: null
+      torch_seed: null
+      max_seq_len: 4096
+      max_batch_size: 1
+  safety:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      llama_guard_shield:
+        model: Llama-Guard-3-1B
+        excluded_categories: []
+        disable_input_check: false
+        disable_output_check: false
+      prompt_guard_shield:
+        model: Prompt-Guard-86M
+  memory:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config: {}
+  agents:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      persistence_store:
+        namespace: null
+        type: sqlite
+        db_path: ~/.llama/runtime/kvstore.db
+  telemetry:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config: {}
diff --git a/distributions/tgi/README.md b/distributions/tgi/README.md
index 0aefaf374..86d2636d7 100644
--- a/distributions/tgi/README.md
+++ b/distributions/tgi/README.md
@@ -40,7 +40,7 @@ docker compose down
 ### Start the Distribution (Single Node CPU)

 > [!NOTE]
-> This assumes you have an hosted endpoint
+> This assumes you have a hosted endpoint compatible with a TGI server.

 ```
 $ cd llama-stack/distribution/tgi/cpu
@@ -49,6 +49,15 @@ compose.yaml run.yaml
 $ docker compose up
 ```

+Replace `<YOUR_TGI_ENDPOINT>` in the `run.yaml` file with your TGI endpoint URL:
+```
+inference:
+  - provider_id: tgi0
+    provider_type: remote::tgi
+    config:
+      url: <YOUR_TGI_ENDPOINT>
+```
+
 ### (Alternative) TGI server + llama stack run (Single Node GPU)

 If you wish to separately spin up a TGI server, and connect with Llama Stack, you may use the following commands.
diff --git a/distributions/together/build.yaml b/distributions/together/build.yaml
new file mode 100644
index 000000000..67ba2eefa
--- /dev/null
+++ b/distributions/together/build.yaml
@@ -0,0 +1,10 @@
+name: together
+distribution_spec:
+  description: Use Together.ai for running LLM inference
+  providers:
+    inference: remote::together
+    memory: meta-reference
+    safety: remote::together
+    agents: meta-reference
+    telemetry: meta-reference
+image_type: conda
diff --git a/distributions/vllm/build.yaml b/distributions/vllm/build.yaml
new file mode 100644
index 000000000..814fafd32
--- /dev/null
+++ b/distributions/vllm/build.yaml
@@ -0,0 +1,10 @@
+name: vllm
+distribution_spec:
+  description: Like local, but use vLLM for running LLM inference
+  providers:
+    inference: vllm
+    memory: meta-reference
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
+image_type: conda
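
For orientation, a minimal usage sketch of how the relocated configs are consumed. It assumes the `llama stack build --config` and `llama stack run` CLI commands present in the repo at this point, and the paths assume this patch has been applied.

```
# Build a conda environment or docker image from one of the moved build configs
$ llama stack build --config distributions/fireworks/build.yaml

# Start a stack server from a run config (meta-reference-gpu shown here)
$ llama stack run distributions/meta-reference-gpu/run.yaml
```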