chore: remove distributions folder (#1801)
# What does this PR do?

- The distributions folder references the templates and contains dead docker compose scripts.

## Test Plan
parent f8445b0d69
commit 742020b94a

45 changed files with 0 additions and 742 deletions
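The folder being removed held little beyond symlinks back into `llama_stack/templates/` and stale compose files that duplicated the templates' entrypoints. A minimal sketch of the symlink pattern, assuming a `distributions/bedrock/` layout (inferred from the PR title and the relative target shown in the first deleted entry below):

```sh
# Hypothetical check; the target path is taken verbatim from the diff below.
readlink distributions/bedrock/build.yaml
# expected output: ../../llama_stack/templates/bedrock/build.yaml
```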
@@ -1 +0,0 @@
../../llama_stack/templates/bedrock/build.yaml

@@ -1,15 +0,0 @@
services:
  llamastack:
    image: distribution-bedrock
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-bedrock.yaml
    ports:
      - "8321:8321"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

@@ -1 +0,0 @@
../../llama_stack/templates/bedrock/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/cerebras/build.yaml

@@ -1,16 +0,0 @@
services:
  llamastack:
    image: llamastack/distribution-cerebras
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-cerebras.yaml
    ports:
      - "8321:8321"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

@@ -1 +0,0 @@
../../llama_stack/templates/cerebras/run.yaml

@@ -1,50 +0,0 @@
services:
  text-generation-inference:
    image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
    network_mode: "host"
    volumes:
      - $HOME/.cache/huggingface:/data
    ports:
      - "5009:5009"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
      - NUM_SHARD=4
      - MAX_BATCH_PREFILL_TOKENS=32768
      - MAX_INPUT_TOKENS=8000
      - MAX_TOTAL_TOKENS=8192
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: all
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
  llamastack:
    depends_on:
      text-generation-inference:
        condition: service_healthy
    image: llamastack/distribution-tgi
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to TGI run.yaml file
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "8321:8321"
    # Hack: wait for TGI server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    restart_policy:
      condition: on-failure
      delay: 3s
      max_attempts: 5
      window: 60s

@@ -1,44 +0,0 @@
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: tgi0
    provider_type: remote::tgi
    config:
      url: http://127.0.0.1:80
  safety:
  - provider_id: meta0
    provider_type: inline::llama-guard
    config:
      model: Llama-Guard-3-1B
      excluded_categories: []
  - provider_id: meta1
    provider_type: inline::prompt-guard
    config:
      model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: inline::faiss
    config: {}
  agents:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}

@@ -1 +0,0 @@
../../llama_stack/templates/fireworks/build.yaml

@@ -1,14 +0,0 @@
services:
  llamastack:
    image: llamastack/distribution-fireworks
    ports:
      - "8321:8321"
    environment:
      - FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template fireworks"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

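Note that `FIREWORKS_API_KEY=${FIREWORKS_API_KEY}` above only forwards a value that must already be set in the host shell when compose runs. A hypothetical launch of this (now deleted) file, assuming it is saved under a name compose picks up by default:

```sh
export FIREWORKS_API_KEY="<your-key>"   # placeholder, not a real key
docker compose up llamastack
```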
@@ -1 +0,0 @@
../../llama_stack/templates/fireworks/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/meta-reference-gpu/build.yaml

@@ -1,34 +0,0 @@
services:
  llamastack:
    image: llamastack/distribution-meta-reference-gpu
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
    runtime: nvidia
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"

@@ -1 +0,0 @@
../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/meta-reference-gpu/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml

@@ -1,35 +0,0 @@
services:
  llamastack:
    image: llamastack/distribution-meta-reference-quantized-gpu
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

@@ -1,58 +0,0 @@
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: meta0
    provider_type: inline::meta-reference-quantized
    config:
      model: Llama3.2-3B-Instruct:int4-qlora-eo8
      quantization:
        type: int4
      torch_seed: null
      max_seq_len: 2048
      max_batch_size: 1
  - provider_id: meta1
    provider_type: inline::meta-reference-quantized
    config:
      # not a quantized model !
      model: Llama-Guard-3-1B
      quantization: null
      torch_seed: null
      max_seq_len: 2048
      max_batch_size: 1
  safety:
  - provider_id: meta0
    provider_type: inline::llama-guard
    config:
      model: Llama-Guard-3-1B
      excluded_categories: []
  - provider_id: meta1
    provider_type: inline::prompt-guard
    config:
      model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
  agents:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}

@@ -1 +0,0 @@
../../llama_stack/templates/ollama/build.yaml

@@ -1,71 +0,0 @@
services:
  ollama:
    image: ollama/ollama:latest
    network_mode: ${NETWORK_MODE:-bridge}
    volumes:
      - ~/.ollama:/root/.ollama
    ports:
      - "11434:11434"
    environment:
      OLLAMA_DEBUG: 1
    command: []
    deploy:
      resources:
        limits:
          memory: 8G # Set maximum memory
        reservations:
          memory: 8G # Set minimum memory reservation
    # healthcheck:
    #   # ugh, no CURL in ollama image
    #   test: ["CMD", "curl", "-f", "http://ollama:11434"]
    #   interval: 10s
    #   timeout: 5s
    #   retries: 5

  ollama-init:
    image: ollama/ollama:latest
    depends_on:
      - ollama
      # condition: service_healthy
    network_mode: ${NETWORK_MODE:-bridge}
    environment:
      - OLLAMA_HOST=ollama
      - INFERENCE_MODEL=${INFERENCE_MODEL}
      - SAFETY_MODEL=${SAFETY_MODEL:-}
    volumes:
      - ~/.ollama:/root/.ollama
      - ./pull-models.sh:/pull-models.sh
    entrypoint: ["/pull-models.sh"]

  llamastack:
    depends_on:
      ollama:
        condition: service_started
      ollama-init:
        condition: service_started
    image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
    network_mode: ${NETWORK_MODE:-bridge}
    volumes:
      - ~/.llama:/root/.llama
      # Link to ollama run.yaml file
      - ~/local/llama-stack/:/app/llama-stack-source
      - ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
    ports:
      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
    environment:
      - INFERENCE_MODEL=${INFERENCE_MODEL}
      - SAFETY_MODEL=${SAFETY_MODEL:-}
      - OLLAMA_URL=http://ollama:11434
    entrypoint: >
      python -m llama_stack.distribution.server.server /root/my-run.yaml \
      --port ${LLAMA_STACK_PORT:-8321}
    deploy:
      restart_policy:
        condition: on-failure
        delay: 10s
        max_attempts: 3
        window: 60s
volumes:
  ollama:
  ollama-init:
  llamastack:

@@ -1,18 +0,0 @@
#!/bin/sh

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
  echo "Preloading $model..."
  if ! ollama run "$model"; then
    echo "Failed to pull and run $model"
    exit 1
  fi
done

echo "All models pulled successfully"

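In the compose file above, the `ollama-init` service supplies `INFERENCE_MODEL` and `SAFETY_MODEL` through its environment and points `OLLAMA_HOST` at the `ollama` service; run by hand, the script would be driven the same way. A sketch with illustrative model tags (not taken from the deleted files):

```sh
# Requires a reachable ollama server; `ollama run` pulls each model if it is missing.
INFERENCE_MODEL=llama3.2:3b SAFETY_MODEL=llama-guard3:1b ./pull-models.sh
```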
@@ -1 +0,0 @@
../../llama_stack/templates/ollama/run-with-safety.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/ollama/run.yaml

Binary file not shown.

@@ -1 +0,0 @@
../../llama_stack/templates/nvidia/build.yaml

@@ -1,19 +0,0 @@
services:
  llamastack:
    image: distribution-nvidia:dev
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-nvidia.yaml
    ports:
      - "8321:8321"
    environment:
      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

@@ -1 +0,0 @@
../../llama_stack/templates/nvidia/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/remote-vllm/build.yaml

@@ -1,99 +0,0 @@
services:
  vllm-inference:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_INFERENCE_PORT:-5100}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  # A little trick:
  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
  vllm-${VLLM_SAFETY_MODEL:+safety}:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_SAFETY_MODEL}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_SAFETY_PORT:-5101}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
  llamastack:
    depends_on:
    - vllm-inference:
        condition: service_healthy
    - vllm-${VLLM_SAFETY_MODEL:+safety}:
        condition: service_healthy
    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
    volumes:
      - ~/.llama:/root/.llama
      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
    network_mode: ${NETWORK_MODE:-bridged}
    environment:
      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - MAX_TOKENS=${MAX_TOKENS:-4096}
      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
    ports:
      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
    # Hack: wait for vLLM server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 8321"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
volumes:
  vllm-inference:
  vllm-safety:
  llamastack:

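The `vllm-${VLLM_SAFETY_MODEL:+safety}` key above relies on docker compose applying the same shell-style `${VAR:+word}` expansion when it interpolates the file; per the comment in the deleted file, the unset case produces a trailing-hyphen entry that compose ignores. A quick shell sketch of how the name resolves (values are illustrative):

```sh
VLLM_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
echo "vllm-${VLLM_SAFETY_MODEL:+safety}"   # -> vllm-safety
unset VLLM_SAFETY_MODEL
echo "vllm-${VLLM_SAFETY_MODEL:+safety}"   # -> vllm-
```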
@@ -1 +0,0 @@
../../llama_stack/templates/remote-vllm/run-with-safety.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/remote-vllm/run.yaml

@@ -1,9 +0,0 @@
name: runpod
distribution_spec:
  description: Use Runpod for running LLM inference
  providers:
    inference: remote::runpod
    memory: meta-reference
    safety: meta-reference
    agents: meta-reference
    telemetry: meta-reference

@@ -1 +0,0 @@
../../llama_stack/templates/sambanova/build.yaml

@@ -1,16 +0,0 @@
services:
  llamastack:
    image: llamastack/distribution-sambanova
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-sambanova.yaml
    ports:
      - "5000:5000"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-sambanova.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

@@ -1 +0,0 @@
../../llama_stack/templates/sambanova/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/tgi/build.yaml

@@ -1,103 +0,0 @@
services:
  tgi-inference:
    image: ghcr.io/huggingface/text-generation-inference:latest
    volumes:
      - $HOME/.cache/huggingface:/data
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
      - HF_TOKEN=$HF_TOKEN
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --port ${TGI_INFERENCE_PORT:-8080}
      --cuda-memory-fraction 0.75
    healthcheck:
      test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  tgi-${TGI_SAFETY_MODEL:+safety}:
    image: ghcr.io/huggingface/text-generation-inference:latest
    volumes:
      - $HOME/.cache/huggingface:/data
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
      - HF_TOKEN=$HF_TOKEN
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
      --port ${TGI_SAFETY_PORT:-8081}
      --cuda-memory-fraction 0.75
    healthcheck:
      test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  llamastack:
    depends_on:
      tgi-inference:
        condition: service_healthy
      tgi-${TGI_SAFETY_MODEL:+safety}:
        condition: service_healthy
    image: llamastack/distribution-tgi:test-0.0.52rc3
    network_mode: ${NETWORK_MODE:-bridged}
    volumes:
      - ~/.llama:/root/.llama
      - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
    ports:
      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
    # Hack: wait for TGI server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    restart_policy:
      condition: on-failure
      delay: 3s
      max_attempts: 5
      window: 60s
    environment:
      - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
      - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}

volumes:
  tgi-inference:
  tgi-safety:
  llamastack:

@@ -1 +0,0 @@
../../llama_stack/templates/tgi/run-with-safety.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/tgi/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/together/build.yaml

@@ -1,14 +0,0 @@
services:
  llamastack:
    image: llamastack/distribution-together
    ports:
      - "8321:8321"
    environment:
      - TOGETHER_API_KEY=${TOGETHER_API_KEY}
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template together"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

@@ -1 +0,0 @@
../../llama_stack/templates/together/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/inline-vllm/build.yaml

@@ -1,35 +0,0 @@
services:
  llamastack:
    image: llamastack/distribution-inline-vllm
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

@@ -1,66 +0,0 @@
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: vllm-inference
    provider_type: inline::vllm
    config:
      model: Llama3.2-3B-Instruct
      tensor_parallel_size: 1
      gpu_memory_utilization: 0.4
      enforce_eager: true
      max_tokens: 4096
  - provider_id: vllm-inference-safety
    provider_type: inline::vllm
    config:
      model: Llama-Guard-3-1B
      tensor_parallel_size: 1
      gpu_memory_utilization: 0.2
      enforce_eager: true
      max_tokens: 4096
  safety:
  - provider_id: meta0
    provider_type: inline::llama-guard
    config:
      model: Llama-Guard-3-1B
      excluded_categories: []
  # Uncomment to use prompt guard
  # - provider_id: meta1
  #   provider_type: inline::prompt-guard
  #   config:
  #     model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
  # Uncomment to use pgvector
  # - provider_id: pgvector
  #   provider_type: remote::pgvector
  #   config:
  #     host: 127.0.0.1
  #     port: 5432
  #     db: postgres
  #     user: postgres
  #     password: mysecretpassword
  agents:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/agents_store.db
  telemetry:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}