mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-12-17 08:02:38 +00:00
Convert TGI
This commit is contained in:
parent
9bb07ce298
commit
028530546f
14 changed files with 485 additions and 160 deletions
|
|
@ -1,51 +1,89 @@
|
|||
services:
|
||||
text-generation-inference:
|
||||
tgi-inference:
|
||||
image: ghcr.io/huggingface/text-generation-inference:latest
|
||||
network_mode: "host"
|
||||
volumes:
|
||||
- $HOME/.cache/huggingface:/data
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
ports:
|
||||
- "5009:5009"
|
||||
- "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
|
||||
devices:
|
||||
- nvidia.com/gpu=all
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=0
|
||||
- CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
|
||||
- HF_TOKEN=$HF_TOKEN
|
||||
- HF_HOME=/data
|
||||
- HF_DATASETS_CACHE=/data
|
||||
- HF_MODULES_CACHE=/data
|
||||
- HF_HUB_CACHE=/data
|
||||
command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
|
||||
command: >
|
||||
--dtype bfloat16
|
||||
--usage-stats off
|
||||
--sharded false
|
||||
--model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
|
||||
--port ${TGI_INFERENCE_PORT:-8080}
|
||||
--cuda-memory-fraction 0.75
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 30
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
# that's the closest analogue to --gpus; provide
|
||||
# an integer amount of devices or 'all'
|
||||
count: 1
|
||||
# Devices are reserved using a list of capabilities, making
|
||||
# capabilities the only required field. A device MUST
|
||||
# satisfy all the requested capabilities for a successful
|
||||
# reservation.
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
|
||||
tgi-${TGI_SAFETY_MODEL:+safety}:
|
||||
image: ghcr.io/huggingface/text-generation-inference:latest
|
||||
volumes:
|
||||
- $HOME/.cache/huggingface:/data
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
ports:
|
||||
- "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
|
||||
devices:
|
||||
- nvidia.com/gpu=all
|
||||
environment:
|
||||
- CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
|
||||
- HF_TOKEN=$HF_TOKEN
|
||||
- HF_HOME=/data
|
||||
- HF_DATASETS_CACHE=/data
|
||||
- HF_MODULES_CACHE=/data
|
||||
- HF_HUB_CACHE=/data
|
||||
command: >
|
||||
--dtype bfloat16
|
||||
--usage-stats off
|
||||
--sharded false
|
||||
--model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
|
||||
--port ${TGI_SAFETY_PORT:-8081}
|
||||
--cuda-memory-fraction 0.75
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://text-generation-inference:5009/health"]
|
||||
test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 30
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
capabilities: [gpu]
|
||||
runtime: nvidia
|
||||
|
||||
llamastack:
|
||||
depends_on:
|
||||
text-generation-inference:
|
||||
tgi-inference:
|
||||
condition: service_healthy
|
||||
image: llamastack/distribution-tgi
|
||||
network_mode: "host"
|
||||
tgi-${TGI_SAFETY_MODEL:+safety}:
|
||||
condition: service_healthy
|
||||
image: llamastack/distribution-tgi:test-0.0.52rc3
|
||||
network_mode: ${NETWORK_MODE:-bridged}
|
||||
volumes:
|
||||
- ~/.llama:/root/.llama
|
||||
# Link to TGI run.yaml file
|
||||
- ./run.yaml:/root/my-run.yaml
|
||||
- ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
|
||||
ports:
|
||||
- "5000:5000"
|
||||
- "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
|
||||
# Hack: wait for TGI server to start before starting docker
|
||||
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
|
||||
restart_policy:
|
||||
|
|
@ -53,3 +91,13 @@ services:
|
|||
delay: 3s
|
||||
max_attempts: 5
|
||||
window: 60s
|
||||
environment:
|
||||
- TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
|
||||
- SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
|
||||
- INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
|
||||
- SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
|
||||
|
||||
volumes:
|
||||
tgi-inference:
|
||||
tgi-safety:
|
||||
llamastack:
|
||||
|
|
|
|||
67
distributions/tgi/run-with-safety.yaml
Normal file
67
distributions/tgi/run-with-safety.yaml
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
version: '2'
|
||||
built_at: 2024-11-17 14:48:56.991119
|
||||
image_name: tgi
|
||||
docker_image: llamastack/distribution-remote-tgi:test-0.0.52rc3
|
||||
conda_env: null
|
||||
apis:
|
||||
- safety
|
||||
- agents
|
||||
- telemetry
|
||||
- memory
|
||||
- inference
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: tgi-inference
|
||||
provider_type: remote::tgi
|
||||
config:
|
||||
url: ${env.TGI_URL}
|
||||
- provider_id: tgi-safety
|
||||
provider_type: remote::tgi
|
||||
config:
|
||||
url: ${env.SAFETY_TGI_URL}
|
||||
memory:
|
||||
- provider_id: faiss
|
||||
provider_type: inline::faiss
|
||||
config:
|
||||
kvstore:
|
||||
type: sqlite
|
||||
namespace: null
|
||||
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/faiss_store.db
|
||||
safety:
|
||||
- provider_id: llama-guard
|
||||
provider_type: inline::llama-guard
|
||||
config: {}
|
||||
agents:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
type: sqlite
|
||||
namespace: null
|
||||
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/agents_store.db
|
||||
telemetry:
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
||||
metadata_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/registry.db
|
||||
models:
|
||||
- metadata: {}
|
||||
model_id: ${env.INFERENCE_MODEL}
|
||||
provider_id: tgi-inference
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: ${env.SAFETY_MODEL}
|
||||
provider_id: tgi-safety
|
||||
provider_model_id: null
|
||||
shields:
|
||||
- params: null
|
||||
shield_id: ${env.SAFETY_MODEL}
|
||||
provider_id: null
|
||||
provider_shield_id: null
|
||||
memory_banks: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
eval_tasks: []
|
||||
|
|
@ -1,45 +1,55 @@
|
|||
version: '2'
|
||||
built_at: '2024-10-08T17:40:45.325529'
|
||||
image_name: local
|
||||
docker_image: null
|
||||
conda_env: local
|
||||
built_at: 2024-11-17 14:48:56.975663
|
||||
image_name: tgi
|
||||
docker_image: llamastack/distribution-remote-tgi:test-0.0.52rc3
|
||||
conda_env: null
|
||||
apis:
|
||||
- shields
|
||||
- agents
|
||||
- models
|
||||
- memory
|
||||
- memory_banks
|
||||
- inference
|
||||
- safety
|
||||
- agents
|
||||
- telemetry
|
||||
- memory
|
||||
- inference
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: tgi0
|
||||
- provider_id: tgi-inference
|
||||
provider_type: remote::tgi
|
||||
config:
|
||||
url: http://127.0.0.1:5009
|
||||
safety:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
model: Llama-Guard-3-1B
|
||||
excluded_categories: []
|
||||
- provider_id: meta1
|
||||
provider_type: inline::prompt-guard
|
||||
config:
|
||||
model: Prompt-Guard-86M
|
||||
url: ${env.TGI_URL}
|
||||
memory:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
- provider_id: faiss
|
||||
provider_type: inline::faiss
|
||||
config:
|
||||
kvstore:
|
||||
type: sqlite
|
||||
namespace: null
|
||||
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/faiss_store.db
|
||||
safety:
|
||||
- provider_id: llama-guard
|
||||
provider_type: inline::llama-guard
|
||||
config: {}
|
||||
agents:
|
||||
- provider_id: meta0
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ~/.llama/runtime/kvstore.db
|
||||
namespace: null
|
||||
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/agents_store.db
|
||||
telemetry:
|
||||
- provider_id: meta0
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
||||
metadata_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/registry.db
|
||||
models:
|
||||
- metadata: {}
|
||||
model_id: ${env.INFERENCE_MODEL}
|
||||
provider_id: tgi-inference
|
||||
provider_model_id: null
|
||||
shields: []
|
||||
memory_banks: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
eval_tasks: []
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue