forked from phoenix-oss/llama-stack-mirror
Auto-generate distro yamls + docs (#468)
# What does this PR do? Automatically generates - build.yaml - run.yaml - run-with-safety.yaml - parts of markdown docs for the distributions. ## Test Plan At this point, this only updates the YAMLs and the docs. Some testing (especially with ollama and vllm) has been performed but needs to be much more tested.
This commit is contained in:
parent
0784284ab5
commit
2a31163178
88 changed files with 3008 additions and 852 deletions
|
@ -1,5 +1,4 @@
|
|||
version: '2'
|
||||
built_at: '2024-11-01T17:40:45.325529'
|
||||
image_name: local
|
||||
name: bedrock
|
||||
docker_image: null
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
version: '2'
|
||||
built_at: '2024-10-08T17:40:45.325529'
|
||||
image_name: local
|
||||
docker_image: null
|
||||
conda_env: local
|
||||
|
|
|
@ -1,51 +0,0 @@
|
|||
version: '2'
|
||||
built_at: '2024-10-08T17:40:45.325529'
|
||||
image_name: local
|
||||
docker_image: null
|
||||
conda_env: local
|
||||
apis:
|
||||
- shields
|
||||
- agents
|
||||
- models
|
||||
- memory
|
||||
- memory_banks
|
||||
- inference
|
||||
- safety
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: fireworks0
|
||||
provider_type: remote::fireworks
|
||||
config:
|
||||
url: https://api.fireworks.ai/inference
|
||||
# api_key: <ENTER_YOUR_API_KEY>
|
||||
safety:
|
||||
safety:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
model: Llama-Guard-3-1B
|
||||
excluded_categories: []
|
||||
- provider_id: meta1
|
||||
provider_type: inline::prompt-guard
|
||||
config:
|
||||
model: Prompt-Guard-86M
|
||||
memory:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
||||
# Uncomment to use weaviate memory provider
|
||||
# - provider_id: weaviate0
|
||||
# provider_type: remote::weaviate
|
||||
# config: {}
|
||||
agents:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ~/.llama/runtime/kvstore.db
|
||||
telemetry:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
1
distributions/fireworks/run.yaml
Symbolic link
1
distributions/fireworks/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/fireworks/run.yaml
|
|
@ -1,5 +1,4 @@
|
|||
version: '2'
|
||||
built_at: '2024-10-08T17:40:45.325529'
|
||||
image_name: local
|
||||
docker_image: null
|
||||
conda_env: local
|
||||
|
|
1
distributions/meta-reference-gpu/run-with-safety.yaml
Symbolic link
1
distributions/meta-reference-gpu/run-with-safety.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
|
|
@ -1,69 +0,0 @@
|
|||
version: '2'
|
||||
built_at: '2024-10-08T17:40:45.325529'
|
||||
image_name: local
|
||||
docker_image: null
|
||||
conda_env: local
|
||||
apis:
|
||||
- shields
|
||||
- agents
|
||||
- models
|
||||
- memory
|
||||
- memory_banks
|
||||
- inference
|
||||
- safety
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: inference0
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
model: Llama3.2-3B-Instruct
|
||||
quantization: null
|
||||
torch_seed: null
|
||||
max_seq_len: 4096
|
||||
max_batch_size: 1
|
||||
- provider_id: inference1
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
model: Llama-Guard-3-1B
|
||||
quantization: null
|
||||
torch_seed: null
|
||||
max_seq_len: 2048
|
||||
max_batch_size: 1
|
||||
safety:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
model: Llama-Guard-3-1B
|
||||
excluded_categories: []
|
||||
- provider_id: meta1
|
||||
provider_type: inline::prompt-guard
|
||||
config:
|
||||
model: Prompt-Guard-86M
|
||||
# Uncomment to use prompt guard
|
||||
# prompt_guard_shield:
|
||||
# model: Prompt-Guard-86M
|
||||
memory:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
||||
# Uncomment to use pgvector
|
||||
# - provider_id: pgvector
|
||||
# provider_type: remote::pgvector
|
||||
# config:
|
||||
# host: 127.0.0.1
|
||||
# port: 5432
|
||||
# db: postgres
|
||||
# user: postgres
|
||||
# password: mysecretpassword
|
||||
agents:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ~/.llama/runtime/agents_store.db
|
||||
telemetry:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
1
distributions/meta-reference-gpu/run.yaml
Symbolic link
1
distributions/meta-reference-gpu/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/meta-reference-gpu/run.yaml
|
|
@ -1,5 +1,4 @@
|
|||
version: '2'
|
||||
built_at: '2024-10-08T17:40:45.325529'
|
||||
image_name: local
|
||||
docker_image: null
|
||||
conda_env: local
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
version: '2'
|
||||
built_at: '2024-10-08T17:40:45.325529'
|
||||
image_name: local
|
||||
docker_image: null
|
||||
conda_env: local
|
||||
|
@ -13,20 +12,15 @@ apis:
|
|||
- safety
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: ollama0
|
||||
- provider_id: ollama
|
||||
provider_type: remote::ollama
|
||||
config:
|
||||
url: http://127.0.0.1:14343
|
||||
url: ${env.OLLAMA_URL:http://127.0.0.1:11434}
|
||||
safety:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
model: Llama-Guard-3-1B
|
||||
excluded_categories: []
|
||||
- provider_id: meta1
|
||||
provider_type: inline::prompt-guard
|
||||
config:
|
||||
model: Prompt-Guard-86M
|
||||
memory:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
|
@ -43,3 +37,10 @@ providers:
|
|||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
||||
models:
|
||||
- model_id: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
|
||||
provider_id: ollama
|
||||
- model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
|
||||
provider_id: ollama
|
||||
shields:
|
||||
- shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
|
||||
|
|
|
@ -1,30 +1,71 @@
|
|||
services:
|
||||
ollama:
|
||||
image: ollama/ollama:latest
|
||||
network_mode: "host"
|
||||
network_mode: ${NETWORK_MODE:-bridge}
|
||||
volumes:
|
||||
- ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
|
||||
- ~/.ollama:/root/.ollama
|
||||
ports:
|
||||
- "11434:11434"
|
||||
environment:
|
||||
OLLAMA_DEBUG: 1
|
||||
command: []
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 8G # Set maximum memory
|
||||
reservations:
|
||||
memory: 8G # Set minimum memory reservation
|
||||
# healthcheck:
|
||||
# # ugh, no CURL in ollama image
|
||||
# test: ["CMD", "curl", "-f", "http://ollama:11434"]
|
||||
# interval: 10s
|
||||
# timeout: 5s
|
||||
# retries: 5
|
||||
|
||||
ollama-init:
|
||||
image: ollama/ollama:latest
|
||||
depends_on:
|
||||
- ollama
|
||||
# condition: service_healthy
|
||||
network_mode: ${NETWORK_MODE:-bridge}
|
||||
environment:
|
||||
- OLLAMA_HOST=ollama
|
||||
- INFERENCE_MODEL=${INFERENCE_MODEL}
|
||||
- SAFETY_MODEL=${SAFETY_MODEL:-}
|
||||
volumes:
|
||||
- ~/.ollama:/root/.ollama
|
||||
- ./pull-models.sh:/pull-models.sh
|
||||
entrypoint: ["/pull-models.sh"]
|
||||
|
||||
llamastack:
|
||||
depends_on:
|
||||
- ollama
|
||||
image: llamastack/distribution-ollama
|
||||
network_mode: "host"
|
||||
ollama:
|
||||
condition: service_started
|
||||
ollama-init:
|
||||
condition: service_started
|
||||
image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
|
||||
network_mode: ${NETWORK_MODE:-bridge}
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
# Link to ollama run.yaml file
|
||||
- ./run.yaml:/root/my-run.yaml
|
||||
- ~/local/llama-stack/:/app/llama-stack-source
|
||||
- ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
|
||||
ports:
|
||||
- "5000:5000"
|
||||
# Hack: wait for ollama server to start before starting docker
|
||||
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
|
||||
- "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
|
||||
environment:
|
||||
- INFERENCE_MODEL=${INFERENCE_MODEL}
|
||||
- SAFETY_MODEL=${SAFETY_MODEL:-}
|
||||
- OLLAMA_URL=http://ollama:11434
|
||||
entrypoint: >
|
||||
python -m llama_stack.distribution.server.server /root/my-run.yaml \
|
||||
--port ${LLAMA_STACK_PORT:-5001}
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
delay: 3s
|
||||
max_attempts: 5
|
||||
delay: 10s
|
||||
max_attempts: 3
|
||||
window: 60s
|
||||
volumes:
|
||||
ollama:
|
||||
ollama-init:
|
||||
llamastack:
|
||||
|
|
18
distributions/ollama/pull-models.sh
Executable file
18
distributions/ollama/pull-models.sh
Executable file
|
@ -0,0 +1,18 @@
|
|||
#!/bin/sh
|
||||
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
|
||||
for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
|
||||
echo "Preloading $model..."
|
||||
if ! ollama run "$model"; then
|
||||
echo "Failed to pull and run $model"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
echo "All models pulled successfully"
|
1
distributions/ollama/run-with-safety.yaml
Symbolic link
1
distributions/ollama/run-with-safety.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/ollama/run-with-safety.yaml
|
|
@ -1,45 +0,0 @@
|
|||
version: '2'
|
||||
built_at: '2024-10-08T17:40:45.325529'
|
||||
image_name: local
|
||||
docker_image: null
|
||||
conda_env: local
|
||||
apis:
|
||||
- shields
|
||||
- agents
|
||||
- models
|
||||
- memory
|
||||
- memory_banks
|
||||
- inference
|
||||
- safety
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: ollama0
|
||||
provider_type: remote::ollama
|
||||
config:
|
||||
url: http://127.0.0.1:14343
|
||||
safety:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
model: Llama-Guard-3-1B
|
||||
excluded_categories: []
|
||||
- provider_id: meta1
|
||||
provider_type: inline::prompt-guard
|
||||
config:
|
||||
model: Prompt-Guard-86M
|
||||
memory:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
||||
agents:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ~/.llama/runtime/kvstore.db
|
||||
telemetry:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
1
distributions/ollama/run.yaml
Symbolic link
1
distributions/ollama/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/ollama/run.yaml
|
|
@ -1,33 +1,28 @@
|
|||
# NOTES:
|
||||
#
|
||||
# This Docker Compose (and the associated run.yaml) assumes you will be
|
||||
# running in the default "bridged" network mode.
|
||||
#
|
||||
# If you need "host" network mode, please uncomment
|
||||
# - network_mode: "host"
|
||||
#
|
||||
# Similarly change "host.docker.internal" to "localhost" in the run.yaml file
|
||||
#
|
||||
services:
|
||||
vllm-0:
|
||||
vllm-inference:
|
||||
image: vllm/vllm-openai:latest
|
||||
volumes:
|
||||
- $HOME/.cache/huggingface:/root/.cache/huggingface
|
||||
# network_mode: "host"
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
ports:
|
||||
- "5100:5100"
|
||||
- "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
|
||||
devices:
|
||||
- nvidia.com/gpu=all
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=0
|
||||
- CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
|
||||
- HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
|
||||
command: >
|
||||
--gpu-memory-utilization 0.75
|
||||
--model meta-llama/Llama-3.1-8B-Instruct
|
||||
--model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
|
||||
--enforce-eager
|
||||
--max-model-len 8192
|
||||
--max-num-seqs 16
|
||||
--port 5100
|
||||
--port ${VLLM_INFERENCE_PORT:-5100}
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
|
@ -35,25 +30,34 @@ services:
|
|||
- driver: nvidia
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
vllm-1:
|
||||
|
||||
# A little trick:
|
||||
# if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
|
||||
# otherwise, the entry will end in a hyphen which gets ignored by docker compose
|
||||
vllm-${VLLM_SAFETY_MODEL:+safety}:
|
||||
image: vllm/vllm-openai:latest
|
||||
volumes:
|
||||
- $HOME/.cache/huggingface:/root/.cache/huggingface
|
||||
# network_mode: "host"
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
ports:
|
||||
- "5101:5101"
|
||||
- "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
|
||||
devices:
|
||||
- nvidia.com/gpu=all
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=1
|
||||
- CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
|
||||
- HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
|
||||
command: >
|
||||
--gpu-memory-utilization 0.75
|
||||
--model meta-llama/Llama-Guard-3-1B
|
||||
--model ${VLLM_SAFETY_MODEL}
|
||||
--enforce-eager
|
||||
--max-model-len 8192
|
||||
--max-num-seqs 16
|
||||
--port 5101
|
||||
--port ${VLLM_SAFETY_PORT:-5101}
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 5
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
|
@ -63,23 +67,25 @@ services:
|
|||
runtime: nvidia
|
||||
llamastack:
|
||||
depends_on:
|
||||
- vllm-0
|
||||
- vllm-1
|
||||
# image: llamastack/distribution-remote-vllm
|
||||
- vllm-inference:
|
||||
condition: service_healthy
|
||||
- vllm-${VLLM_SAFETY_MODEL:+safety}:
|
||||
condition: service_healthy
|
||||
# image: llamastack/distribution-remote-vllm
|
||||
image: llamastack/distribution-remote-vllm:test-0.0.52rc3
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
- ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml
|
||||
# network_mode: "host"
|
||||
- ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
environment:
|
||||
- LLAMA_INFERENCE_VLLM_URL=${LLAMA_INFERENCE_VLLM_URL:-http://host.docker.internal:5100/v1}
|
||||
- LLAMA_INFERENCE_MODEL=${LLAMA_INFERENCE_MODEL:-Llama3.1-8B-Instruct}
|
||||
- VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
|
||||
- VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
|
||||
- INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
|
||||
- MAX_TOKENS=${MAX_TOKENS:-4096}
|
||||
- SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
|
||||
- LLAMA_SAFETY_VLLM_URL=${LLAMA_SAFETY_VLLM_URL:-http://host.docker.internal:5101/v1}
|
||||
- LLAMA_SAFETY_MODEL=${LLAMA_SAFETY_MODEL:-Llama-Guard-3-1B}
|
||||
- SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
|
||||
ports:
|
||||
- "5001:5001"
|
||||
- "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}"
|
||||
# Hack: wait for vLLM server to start before starting docker
|
||||
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
|
||||
deploy:
|
||||
|
@ -89,6 +95,6 @@ services:
|
|||
max_attempts: 5
|
||||
window: 60s
|
||||
volumes:
|
||||
vllm-0:
|
||||
vllm-1:
|
||||
vllm-inference:
|
||||
vllm-safety:
|
||||
llamastack:
|
||||
|
|
1
distributions/remote-vllm/run-with-safety.yaml
Symbolic link
1
distributions/remote-vllm/run-with-safety.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/remote-vllm/run-with-safety.yaml
|
|
@ -1,68 +0,0 @@
|
|||
version: '2'
|
||||
built_at: '2024-11-11T20:09:45.988375'
|
||||
image_name: remote-vllm
|
||||
docker_image: remote-vllm
|
||||
conda_env: null
|
||||
apis:
|
||||
- inference
|
||||
- memory
|
||||
- safety
|
||||
- agents
|
||||
- telemetry
|
||||
providers:
|
||||
inference:
|
||||
# serves main inference model
|
||||
- provider_id: vllm-0
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
# NOTE: replace with "localhost" if you are running in "host" network mode
|
||||
url: ${env.LLAMA_INFERENCE_VLLM_URL:http://host.docker.internal:5100/v1}
|
||||
max_tokens: ${env.MAX_TOKENS:4096}
|
||||
api_token: fake
|
||||
# serves safety llama_guard model
|
||||
- provider_id: vllm-1
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
# NOTE: replace with "localhost" if you are running in "host" network mode
|
||||
url: ${env.LLAMA_SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}
|
||||
max_tokens: ${env.MAX_TOKENS:4096}
|
||||
api_token: fake
|
||||
memory:
|
||||
- provider_id: faiss-0
|
||||
provider_type: inline::faiss
|
||||
config:
|
||||
kvstore:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/faiss_store.db"
|
||||
safety:
|
||||
- provider_id: llama-guard
|
||||
provider_type: inline::llama-guard
|
||||
config: {}
|
||||
memory:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::faiss
|
||||
config: {}
|
||||
agents:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/agents_store.db"
|
||||
telemetry:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
||||
metadata_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: "${env.SQLITE_STORE_DIR:/home/ashwin/.llama/distributions/remote-vllm}/registry.db"
|
||||
models:
|
||||
- model_id: ${env.LLAMA_INFERENCE_MODEL:Llama3.1-8B-Instruct}
|
||||
provider_id: vllm-0
|
||||
- model_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
|
||||
provider_id: vllm-1
|
||||
shields:
|
||||
- shield_id: ${env.LLAMA_SAFETY_MODEL:Llama-Guard-3-1B}
|
1
distributions/remote-vllm/run.yaml
Symbolic link
1
distributions/remote-vllm/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/remote-vllm/run.yaml
|
|
@ -1,51 +1,89 @@
|
|||
services:
|
||||
text-generation-inference:
|
||||
tgi-inference:
|
||||
image: ghcr.io/huggingface/text-generation-inference:latest
|
||||
network_mode: "host"
|
||||
volumes:
|
||||
- $HOME/.cache/huggingface:/data
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
ports:
|
||||
- "5009:5009"
|
||||
- "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
|
||||
devices:
|
||||
- nvidia.com/gpu=all
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=0
|
||||
- CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
|
||||
- HF_TOKEN=$HF_TOKEN
|
||||
- HF_HOME=/data
|
||||
- HF_DATASETS_CACHE=/data
|
||||
- HF_MODULES_CACHE=/data
|
||||
- HF_HUB_CACHE=/data
|
||||
command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
|
||||
command: >
|
||||
--dtype bfloat16
|
||||
--usage-stats off
|
||||
--sharded false
|
||||
--model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
|
||||
--port ${TGI_INFERENCE_PORT:-8080}
|
||||
--cuda-memory-fraction 0.75
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 30
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
# that's the closest analogue to --gpus; provide
|
||||
# an integer amount of devices or 'all'
|
||||
count: 1
|
||||
# Devices are reserved using a list of capabilities, making
|
||||
# capabilities the only required field. A device MUST
|
||||
# satisfy all the requested capabilities for a successful
|
||||
# reservation.
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
|
||||
tgi-${TGI_SAFETY_MODEL:+safety}:
|
||||
image: ghcr.io/huggingface/text-generation-inference:latest
|
||||
volumes:
|
||||
- $HOME/.cache/huggingface:/data
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
ports:
|
||||
- "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
|
||||
devices:
|
||||
- nvidia.com/gpu=all
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
|
||||
- HF_TOKEN=$HF_TOKEN
|
||||
- HF_HOME=/data
|
||||
- HF_DATASETS_CACHE=/data
|
||||
- HF_MODULES_CACHE=/data
|
||||
- HF_HUB_CACHE=/data
|
||||
command: >
|
||||
--dtype bfloat16
|
||||
--usage-stats off
|
||||
--sharded false
|
||||
--model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
|
||||
--port ${TGI_SAFETY_PORT:-8081}
|
||||
--cuda-memory-fraction 0.75
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://text-generation-inference:5009/health"]
|
||||
test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 30
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
|
||||
llamastack:
|
||||
depends_on:
|
||||
text-generation-inference:
|
||||
tgi-inference:
|
||||
condition: service_healthy
|
||||
image: llamastack/distribution-tgi
|
||||
network_mode: "host"
|
||||
tgi-${TGI_SAFETY_MODEL:+safety}:
|
||||
condition: service_healthy
|
||||
image: llamastack/distribution-tgi:test-0.0.52rc3
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
# Link to TGI run.yaml file
|
||||
- ./run.yaml:/root/my-run.yaml
|
||||
- ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
|
||||
ports:
|
||||
- "5000:5000"
|
||||
- "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
|
||||
# Hack: wait for TGI server to start before starting docker
|
||||
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
|
||||
restart_policy:
|
||||
|
@ -53,3 +91,13 @@ services:
|
|||
delay: 3s
|
||||
max_attempts: 5
|
||||
window: 60s
|
||||
environment:
|
||||
- TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
|
||||
- SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
|
||||
- INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
|
||||
- SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
|
||||
|
||||
volumes:
|
||||
tgi-inference:
|
||||
tgi-safety:
|
||||
llamastack:
|
||||
|
|
1
distributions/tgi/run-with-safety.yaml
Symbolic link
1
distributions/tgi/run-with-safety.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/tgi/run-with-safety.yaml
|
|
@ -1,45 +0,0 @@
|
|||
version: '2'
|
||||
built_at: '2024-10-08T17:40:45.325529'
|
||||
image_name: local
|
||||
docker_image: null
|
||||
conda_env: local
|
||||
apis:
|
||||
- shields
|
||||
- agents
|
||||
- models
|
||||
- memory
|
||||
- memory_banks
|
||||
- inference
|
||||
- safety
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: tgi0
|
||||
provider_type: remote::tgi
|
||||
config:
|
||||
url: http://127.0.0.1:5009
|
||||
safety:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
model: Llama-Guard-3-1B
|
||||
excluded_categories: []
|
||||
- provider_id: meta1
|
||||
provider_type: inline::prompt-guard
|
||||
config:
|
||||
model: Prompt-Guard-86M
|
||||
memory:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
||||
agents:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ~/.llama/runtime/kvstore.db
|
||||
telemetry:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
1
distributions/tgi/run.yaml
Symbolic link
1
distributions/tgi/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/tgi/run.yaml
|
|
@ -1,46 +0,0 @@
|
|||
version: '2'
|
||||
built_at: '2024-10-08T17:40:45.325529'
|
||||
image_name: local
|
||||
docker_image: null
|
||||
conda_env: local
|
||||
apis:
|
||||
- shields
|
||||
- agents
|
||||
- models
|
||||
- memory
|
||||
- memory_banks
|
||||
- inference
|
||||
- safety
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: together0
|
||||
provider_type: remote::together
|
||||
config:
|
||||
url: https://api.together.xyz/v1
|
||||
# api_key: <ENTER_YOUR_API_KEY>
|
||||
safety:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
model: Llama-Guard-3-1B
|
||||
excluded_categories: []
|
||||
- provider_id: meta1
|
||||
provider_type: inline::prompt-guard
|
||||
config:
|
||||
model: Prompt-Guard-86M
|
||||
memory:
|
||||
- provider_id: meta0
|
||||
provider_type: remote::weaviate
|
||||
config: {}
|
||||
agents:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ~/.llama/runtime/kvstore.db
|
||||
telemetry:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
1
distributions/together/run.yaml
Symbolic link
1
distributions/together/run.yaml
Symbolic link
|
@ -0,0 +1 @@
|
|||
../../llama_stack/templates/together/run.yaml
|
Loading…
Add table
Add a link
Reference in a new issue