Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-01 16:24:44 +00:00)

Commit f38e76ee98 (parent e4509cb568): Adding docker-compose.yaml, starting to simplify

14 changed files with 516 additions and 386 deletions
@@ -9,25 +9,30 @@
 # Similarly change "host.docker.internal" to "localhost" in the run.yaml file
 #
 services:
-  vllm-0:
+  vllm-inference:
     image: vllm/vllm-openai:latest
     volumes:
       - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridged}
     ports:
-      - "5100:5100"
+      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=0
+      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
       - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
     command: >
       --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-3.1-8B-Instruct
+      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
       --enforce-eager
       --max-model-len 8192
       --max-num-seqs 16
-      --port 5100
+      --port ${VLLM_INFERENCE_PORT:-5100}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
     deploy:
       resources:
         reservations:
@@ -35,25 +40,34 @@ services:
           - driver: nvidia
             capabilities: [gpu]
     runtime: nvidia
-  vllm-1:
+  # A little trick:
+  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
+  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
+  vllm-${VLLM_SAFETY_MODEL:+safety}:
     image: vllm/vllm-openai:latest
     volumes:
       - $HOME/.cache/huggingface:/root/.cache/huggingface
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridged}
     ports:
-      - "5101:5101"
+      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=1
+      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
     command: >
       --gpu-memory-utilization 0.75
-      --model meta-llama/Llama-Guard-3-1B
+      --model ${VLLM_SAFETY_MODEL}
       --enforce-eager
       --max-model-len 8192
       --max-num-seqs 16
-      --port 5101
+      --port ${VLLM_SAFETY_PORT:-5101}
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 5
     deploy:
       resources:
         reservations:
@@ -63,23 +77,25 @@ services:
     runtime: nvidia
   llamastack:
     depends_on:
-      - vllm-0
-      - vllm-1
-    # image: llamastack/distribution-remote-vllm
+      - vllm-inference:
+          condition: service_healthy
+      - vllm-${VLLM_SAFETY_MODEL:+safety}:
+          condition: service_healthy
+    # image: llamastack/distribution-remote-vllm
     image: llamastack/distribution-remote-vllm:test-0.0.52rc3
     volumes:
       - ~/.llama:/root/.llama
-      - ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml
+      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
-    # network_mode: "host"
+    network_mode: ${NETWORK_MODE:-bridged}
     environment:
-      - LLAMA_INFERENCE_VLLM_URL=${LLAMA_INFERENCE_VLLM_URL:-http://host.docker.internal:5100/v1}
-      - LLAMA_INFERENCE_MODEL=${LLAMA_INFERENCE_MODEL:-Llama3.1-8B-Instruct}
+      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
+      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
+      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
       - MAX_TOKENS=${MAX_TOKENS:-4096}
       - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
-      - LLAMA_SAFETY_VLLM_URL=${LLAMA_SAFETY_VLLM_URL:-http://host.docker.internal:5101/v1}
-      - LLAMA_SAFETY_MODEL=${LLAMA_SAFETY_MODEL:-Llama-Guard-3-1B}
+      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
     ports:
-      - "5001:5001"
+      - "${LLAMASTACK_PORT:-5001}:${LLAMASTACK_PORT:-5001}"
     # Hack: wait for vLLM server to start before starting docker
     entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
     deploy:
@@ -89,6 +105,6 @@ services:
         max_attempts: 5
         window: 60s
 volumes:
-  vllm-0:
-  vllm-1:
+  vllm-inference:
+  vllm-safety:
   llamastack:
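The `vllm-${VLLM_SAFETY_MODEL:+safety}` service name above leans on shell-style parameter expansion as performed by Docker Compose: `:-` supplies a default when the variable is unset or empty, while `:+` substitutes the alternative only when the variable is set. The sketch below is purely illustrative (Compose does this substitution itself; the `expand` helper is hypothetical) and only mirrors the two forms used in this file:

```python
import os


def expand(token: str) -> str:
    """Mimic the two compose interpolation forms used above:
    VAR:-default -> default when VAR is unset or empty,
    VAR:+alt     -> alt when VAR is set and non-empty, otherwise ''."""
    name, sep, word = token.partition(":")
    value = os.environ.get(name, "")
    if not sep:
        return value
    if word.startswith("-"):
        return value if value else word[1:]
    if word.startswith("+"):
        return word[1:] if value else ""
    return value


# With VLLM_SAFETY_MODEL unset, the second vLLM service name collapses to
# "vllm-", which (per the comment in the compose file) docker compose ignores.
os.environ.pop("VLLM_SAFETY_MODEL", None)
assert "vllm-" + expand("VLLM_SAFETY_MODEL:+safety") == "vllm-"

os.environ["VLLM_SAFETY_MODEL"] = "meta-llama/Llama-Guard-3-1B"
assert "vllm-" + expand("VLLM_SAFETY_MODEL:+safety") == "vllm-safety"

os.environ.pop("VLLM_INFERENCE_PORT", None)
assert expand("VLLM_INFERENCE_PORT:-5100") == "5100"
```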

distributions/remote-vllm/run-with-safety.yaml (new file, 68 lines)

version: '2'
built_at: '2024-11-11T20:09:45.988375'
image_name: remote-vllm
docker_image: remote-vllm
conda_env: null
apis:
- inference
- memory
- safety
- agents
- telemetry
providers:
  inference:
  # serves main inference model
  - provider_id: vllm-inference
    provider_type: remote::vllm
    config:
      # NOTE: replace with "localhost" if you are running in "host" network mode
      url: ${env.VLLM_URL}
      max_tokens: ${env.MAX_TOKENS:4096}
      api_token: fake
  # serves safety llama_guard model
  - provider_id: vllm-safety
    provider_type: remote::vllm
    config:
      # NOTE: replace with "localhost" if you are running in "host" network mode
      url: ${env.SAFETY_VLLM_URL}
      max_tokens: ${env.MAX_TOKENS:4096}
      api_token: fake
  memory:
  - provider_id: faiss-0
    provider_type: inline::faiss
    config:
      kvstore:
        namespace: null
        type: sqlite
        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  memory:
  - provider_id: meta0
    provider_type: inline::faiss
    config: {}
  agents:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db"
  telemetry:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
metadata_store:
  namespace: null
  type: sqlite
  db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
models:
- model_id: ${env.INFERENCE_MODEL}
  provider_id: vllm-inference
- model_id: ${env.SAFETY_MODEL}
  provider_id: vllm-safety
shields:
- shield_id: ${env.SAFETY_MODEL}
@@ -6,39 +6,25 @@ conda_env: null
 apis:
 - inference
 - memory
-- safety
 - agents
 - telemetry
 providers:
   inference:
   # serves main inference model
-  - provider_id: vllm-0
+  - provider_id: vllm-inference
     provider_type: remote::vllm
     config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.VLLM_URL:http://host.docker.internal:5100/v1}
-      max_tokens: ${env.MAX_TOKENS:4096}
-      api_token: fake
-  # serves safety llama_guard model
-  - provider_id: vllm-1
-    provider_type: remote::vllm
-    config:
-      # NOTE: replace with "localhost" if you are running in "host" network mode
-      url: ${env.SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}
+      url: ${env.VLLM_URL}
       max_tokens: ${env.MAX_TOKENS:4096}
       api_token: fake
   memory:
-  - provider_id: faiss-0
+  - provider_id: faiss
     provider_type: inline::faiss
     config:
       kvstore:
         namespace: null
         type: sqlite
         db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/faiss_store.db"
-  safety:
-  - provider_id: llama-guard
-    provider_type: inline::llama-guard
-    config: {}
   memory:
   - provider_id: meta0
     provider_type: inline::faiss
@@ -60,9 +46,5 @@ metadata_store:
   type: sqlite
   db_path: "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/registry.db"
 models:
-- model_id: ${env.INFERENCE_MODEL:Llama3.1-8B-Instruct}
-  provider_id: vllm-0
-- model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
-  provider_id: vllm-1
-shields:
-- shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
+- model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference
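Both run files rely on the `${env.VAR}` / `${env.VAR:default}` placeholder syntax, which is meant to be filled in from the server's environment when the config is loaded. The resolver below is only an illustration of that syntax, not the actual llama-stack implementation:

```python
import os
import re

# Matches "${env.VAR}" and "${env.VAR:default}" as used in the run YAMLs above.
ENV_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z0-9_]+)(?::([^}]*))?\}")


def resolve_env_placeholders(text: str) -> str:
    def _sub(match: "re.Match[str]") -> str:
        name, default = match.group(1), match.group(2)
        value = os.environ.get(name)
        if value is not None:
            return value
        if default is not None:
            return default
        raise KeyError(f"environment variable {name} is not set and has no default")

    return ENV_PLACEHOLDER.sub(_sub, text)


os.environ["VLLM_URL"] = "http://vllm-inference:5100/v1"
os.environ.pop("MAX_TOKENS", None)
assert resolve_env_placeholders("${env.VLLM_URL}") == "http://vllm-inference:5100/v1"
assert resolve_env_placeholders("${env.MAX_TOKENS:4096}") == "4096"
```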
@@ -4,37 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from typing import Optional
+from .config import OllamaImplConfig
 
-from llama_stack.distribution.datatypes import RemoteProviderConfig
-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
-
-
-DEFAULT_OLLAMA_PORT = 11434
-
-
-class OllamaImplConfig(RemoteProviderConfig):
-    port: int = DEFAULT_OLLAMA_PORT
-
-    @classmethod
-    def sample_docker_compose_config(cls) -> Optional[DockerComposeServiceConfig]:
-        return DockerComposeServiceConfig(
-            image="ollama/ollama:latest",
-            volumes=["$HOME/.ollama:/root/.ollama"],
-            devices=["nvidia.com/gpu=all"],
-            deploy={
-                "resources": {
-                    "reservations": {
-                        "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
-                    }
-                }
-            },
-            runtime="nvidia",
-            ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
-        )
-
-
-async def get_adapter_impl(config: RemoteProviderConfig, _deps):
+async def get_adapter_impl(config: OllamaImplConfig, _deps):
     from .ollama import OllamaInferenceAdapter
 
     impl = OllamaInferenceAdapter(config.url)

llama_stack/providers/remote/inference/ollama/config.py (new file, 65 lines)

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import List

from llama_stack.distribution.datatypes import RemoteProviderConfig
from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig


DEFAULT_OLLAMA_PORT = 11434


class OllamaImplConfig(RemoteProviderConfig):
    port: int = DEFAULT_OLLAMA_PORT

    @classmethod
    def sample_docker_compose_services(cls) -> List[DockerComposeServiceConfig]:
        return [
            DockerComposeServiceConfig(
                service_name="ollama",
                image="ollama/ollama:latest",
                volumes=["$HOME/.ollama:/root/.ollama"],
                devices=["nvidia.com/gpu=all"],
                deploy={
                    "resources": {
                        "reservations": {
                            "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
                        }
                    }
                },
                runtime="nvidia",
                ports=[f"{DEFAULT_OLLAMA_PORT}:{DEFAULT_OLLAMA_PORT}"],
                healthcheck={
                    "test": ["CMD", "curl", "-f", "http://ollama:11434"],
                    "interval": "10s",
                    "timeout": "5s",
                    "retries": 5,
                },
            ),
            DockerComposeServiceConfig(
                service_name="ollama-init",
                image="ollama/ollama",
                depends_on={"ollama": {"condition": "service_healthy"}},
                environment={
                    "OLLAMA_HOST": "ollama",
                    "OLLAMA_MODELS": "${OLLAMA_MODELS}",
                },
                volumes=["ollama_data:/root/.ollama"],
                entrypoint=(
                    'sh -c \'max_attempts=30;attempt=0;echo "Waiting for Ollama server...";'
                    "until curl -s http://ollama:11434 > /dev/null; do"
                    "attempt=$((attempt + 1));"
                    "if [ $attempt -ge $max_attempts ]; then"
                    'echo "Timeout waiting for Ollama server";'
                    "exit 1;"
                    "fi;"
                    'echo "Attempt $attempt: Server not ready yet...";'
                    "sleep 5;"
                    "done'"
                ),
            ),
        ]
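The new `sample_docker_compose_services()` hook hands back plain configuration objects rather than writing YAML itself. A minimal sketch of how such a list could be rendered into a `services:` mapping is shown below; the `render_compose_yaml` helper is hypothetical and assumes `DockerComposeServiceConfig` is a pydantic v2 model that accepts the `service_name`, `healthcheck`, and `depends_on` arguments used above:

```python
# Hypothetical sketch, not part of this commit.
import yaml

from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig


def render_compose_yaml() -> str:
    services = {}
    for svc in OllamaImplConfig.sample_docker_compose_services():
        body = svc.model_dump(exclude_none=True)
        # the service name becomes the key in the top-level `services:` mapping
        name = body.pop("service_name")
        services[name] = body
    return yaml.safe_dump({"services": services}, sort_keys=False)


if __name__ == "__main__":
    print(render_compose_yaml())
```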
New file (55 lines):

services:
  ${SERVICE_NAME:-ollama}:
    image: ollama/ollama:latest
    ports:
      - ${OLLAMA_PORT:-11434}:${OLLAMA_PORT:-11434}
    volumes:
      - $HOME/.ollama:/root/.ollama
    devices:
      - nvidia.com/gpu=all
    runtime: nvidia
    healthcheck:
      test: ["CMD", "curl", "-f", "http://ollama:11434"]
      interval: 10s
      timeout: 5s
      retries: 5

  ${SERVICE_NAME:-ollama}-init:
    image: ollama/ollama
    depends_on:
      - ${SERVICE_NAME:-ollama}:
          condition: service_healthy
    environment:
      - OLLAMA_HOST=ollama
      - OLLAMA_MODELS=${OLLAMA_MODELS}
    volumes:
      - $HOME/.ollama:/root/.ollama
    entrypoint: >
      sh -c '
      max_attempts=30;
      attempt=0;

      echo "Waiting for Ollama server...";
      until curl -s http://ollama:11434 > /dev/null; do
      attempt=$((attempt + 1));
      if [ $attempt -ge $max_attempts ]; then
      echo "Timeout waiting for Ollama server";
      exit 1;
      fi;
      echo "Attempt $attempt: Server not ready yet...";
      sleep 5;
      done;

      echo "Server ready. Pulling models...";

      models="${OLLAMA_MODELS}";
      for model in $models; do
      echo "Pulling $model...";
      if ! ollama pull "$model"; then
      echo "Failed to pull $model";
      exit 1;
      fi;
      done;

      echo "All models pulled successfully"
      '
New file (35 lines):

services:
  ${SERVICE_NAME:-tgi}:
    image: ghcr.io/huggingface/text-generation-inference:2.3.1
    network_mode: "host"
    volumes:
      - $HOME/.cache/huggingface:/data
    ports:
      - ${TGI_PORT:-8000}:${TGI_PORT:-8000}
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --port ${TGI_PORT:-8000}
      --cuda-memory-fraction ${TGI_CUDA_MEMORY_FRACTION:-0.8}
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
    healthcheck:
      test: ["CMD", "curl", "-f", "http://${SERVICE_NAME:-tgi}:${TGI_PORT:-8000}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
@@ -9,11 +9,6 @@ from typing import Optional
 from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
 
-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
-
-
-DEFAULT_VLLM_PORT = 8000
-
-
 @json_schema_type
 class VLLMInferenceAdapterConfig(BaseModel):
@@ -33,48 +28,10 @@ class VLLMInferenceAdapterConfig(BaseModel):
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.VLLM_URL:http://host.docker.internal:5100/v1}",
+        url: str = "${env.VLLM_URL}",
     ):
         return {
             "url": url,
             "max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
             "api_token": "${env.VLLM_API_TOKEN:fake}",
         }
-
-    @classmethod
-    def sample_docker_compose_config(
-        cls,
-        port: int = DEFAULT_VLLM_PORT,
-        cuda_visible_devices: str = "0",
-        model: str = "meta-llama/Llama-3.2-3B-Instruct",
-    ) -> Optional[DockerComposeServiceConfig]:
-        return DockerComposeServiceConfig(
-            image="vllm/vllm-openai:latest",
-            volumes=["$HOME/.cache/huggingface:/root/.cache/huggingface"],
-            devices=["nvidia.com/gpu=all"],
-            deploy={
-                "resources": {
-                    "reservations": {
-                        "devices": [{"driver": "nvidia", "capabilities": ["gpu"]}]
-                    }
-                }
-            },
-            runtime="nvidia",
-            ports=[f"{port}:{port}"],
-            environment={
-                "CUDA_VISIBLE_DEVICES": cuda_visible_devices,
-                "HUGGING_FACE_HUB_TOKEN": "$HF_TOKEN",
-            },
-            command=(
-                " ".join(
-                    [
-                        "--gpu-memory-utilization 0.75",
-                        f"--model {model}",
-                        "--enforce-eager",
-                        "--max-model-len 8192",
-                        "--max-num-seqs 16",
-                        f"--port {port}",
-                    ]
-                )
-            ),
-        )
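For reference, here is what the updated classmethod now returns; the values are taken verbatim from the code above, and the snippet is only a quick check, not part of the diff:

```python
from llama_stack.providers.remote.inference.vllm.config import VLLMInferenceAdapterConfig

cfg = VLLMInferenceAdapterConfig.sample_run_config()
assert cfg == {
    "url": "${env.VLLM_URL}",
    "max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
    "api_token": "${env.VLLM_API_TOKEN:fake}",
}
```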
New file (26 lines):

services:
  ${SERVICE_NAME:-vllm}:
    image: vllm/vllm-openai:latest
    ports:
      - ${VLLM_PORT:-5100}:${VLLM_PORT:-5100}
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    devices:
      - nvidia.com/gpu=all
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
    environment:
      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}
      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_PORT:-5100}
Deleted file (5 lines):

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
Deleted file (29 lines):

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Any, Dict, List, Optional

from pydantic import BaseModel


class LiteralString(str):
    pass  # Marker class for strings we want to format with >


class DockerComposeServiceConfig(BaseModel):
    """Configuration for a single service in docker-compose."""

    image: str
    volumes: Optional[List[str]] = None
    network_mode: str = "bridge"
    ports: Optional[List[str]] = None
    devices: Optional[List[str]] = None
    environment: Optional[Dict[str, str]] = None
    command: Optional[str] = None
    depends_on: Optional[List[str]] = None
    deploy: Optional[Dict[str, Any]] = None
    runtime: Optional[str] = None
    entrypoint: Optional[str] = None
@@ -54,11 +54,11 @@ class SqliteKVStoreConfig(CommonConfig):
     )
 
     @classmethod
-    def sample_run_config(cls, db_name: str = "kvstore.db"):
+    def sample_run_config(cls, dir: str = "runtime", db_name: str = "kvstore.db"):
         return {
             "type": "sqlite",
             "namespace": None,
-            "db_path": "${env.SQLITE_STORE_DIR:~/.llama/runtime/" + db_name + "}",
+            "db_path": "${env.SQLITE_STORE_DIR:~/.llama/" + f"{dir}/{db_name}" + "}",
         }
 
 
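The reworked helper lets callers scope the SQLite path per distribution. A worked example using only values visible in this diff (this is the call the template generator below makes for its registry store):

```python
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig

cfg = SqliteKVStoreConfig.sample_run_config(
    dir="distributions/remote-vllm", db_name="registry.db"
)
# cfg == {
#     "type": "sqlite",
#     "namespace": None,
#     "db_path": "${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm/registry.db}",
# }
```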

llama_stack/templates/remote-vllm/doc_template.md (new file, 95 lines)

# Remote vLLM Distribution

The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:

{{ providers_table }}

You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.

{%- if docker_compose_env_vars %}
### Environment Variables

The following environment variables can be configured:

{% for var, (default_value, description) in docker_compose_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}

{% if default_models %}
### Models

The following models are configured by default:
{% for model in default_models %}
- `{{ model.model_id }}`
{% endfor %}
{% endif %}

## Using Docker Compose

You can use `docker compose` to start a vLLM container and Llama Stack server container together.
```bash
$ cd distributions/{{ name }}; docker compose up
```

You will see outputs similar to following ---
```
<TO BE FILLED>
```

To kill the server
```bash
docker compose down
```

## Starting vLLM and Llama Stack separately

You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.

#### Start vLLM server.

```bash
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
    --model meta-llama/Llama-3.2-3B-Instruct
```

Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details.

#### Start Llama Stack server pointing to your vLLM server

We have provided a template `run.yaml` file in the `distributions/remote-vllm` directory. Please make sure to modify the `inference.provider_id` to point to your vLLM server endpoint. As an example, if your vLLM server is running on `http://127.0.0.1:8000`, your `run.yaml` file should look like the following:
```yaml
inference:
  - provider_id: vllm0
    provider_type: remote::vllm
    config:
      url: http://127.0.0.1:8000
```

**Via Conda**

If you are using Conda, you can build and run the Llama Stack server with the following commands:
```bash
cd distributions/remote-vllm
llama stack build --template remote_vllm --image-type conda
llama stack run run.yaml
```

**Via Docker**

You can use the Llama Stack Docker image to start the server with the following command:
```bash
docker run --network host -it -p 5000:5000 \
  -v ~/.llama:/root/.llama \
  -v ./gpu/run.yaml:/root/llamastack-run-remote-vllm.yaml \
  --gpus=all \
  llamastack/distribution-remote-vllm \
  --yaml_config /root/llamastack-run-remote-vllm.yaml
```

@@ -9,7 +9,7 @@ from datetime
 from io import StringIO
 
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Set, Tuple
 
 import jinja2
 import yaml
@@ -22,7 +22,6 @@ from llama_stack.distribution.datatypes import (
     Api,
     BuildConfig,
     DistributionSpec,
-    KVStoreConfig,
     ModelInput,
     Provider,
     ShieldInput,
@@ -33,53 +32,26 @@ from llama_stack.distribution.utils.dynamic import instantiate_class_type
 from llama_stack.providers.remote.inference.vllm.config import (
     VLLMInferenceAdapterConfig,
 )
-from llama_stack.providers.utils.docker.service_config import DockerComposeServiceConfig
+from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig
 
 
-class DistributionTemplate(BaseModel):
-    """
-    Represents a Llama Stack distribution instance that can generate configuration
-    and documentation files.
-    """
-
-    name: str
-    description: str
-    providers: Dict[str, List[str]]
-    run_config_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
-    compose_config_overrides: Dict[str, Dict[str, DockerComposeServiceConfig]] = Field(
-        default_factory=dict
-    )
-
+class RunConfigSettings(BaseModel):
+    provider_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
     default_models: List[ModelInput]
     default_shields: Optional[List[ShieldInput]] = None
 
-    # Optional configuration
-    metadata_store: Optional[KVStoreConfig] = None
-    docker_compose_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
-    docker_image: Optional[str] = None
-
-    @property
-    def distribution_spec(self) -> DistributionSpec:
-        return DistributionSpec(
-            description=self.description,
-            docker_image=self.docker_image,
-            providers=self.providers,
-        )
-
-    def build_config(self) -> BuildConfig:
-        return BuildConfig(
-            name=self.name,
-            distribution_spec=self.distribution_spec,
-            image_type="conda",  # default to conda, can be overridden
-        )
-
-    def run_config(self) -> StackRunConfig:
+    def run_config(
+        self,
+        name: str,
+        providers: Dict[str, List[str]],
+        docker_image: Optional[str] = None,
+    ) -> StackRunConfig:
         provider_registry = get_provider_registry()
 
         provider_configs = {}
-        for api_str, provider_types in self.providers.items():
-            if providers := self.run_config_overrides.get(api_str):
-                provider_configs[api_str] = providers
+        for api_str, provider_types in providers.items():
+            if api_providers := self.provider_overrides.get(api_str):
+                provider_configs[api_str] = api_providers
                 continue
 
             provider_type = provider_types[0]
@@ -111,83 +83,53 @@ class DistributionTemplate(BaseModel):
         ]
 
         # Get unique set of APIs from providers
-        apis: Set[str] = set(self.providers.keys())
+        apis: Set[str] = set(providers.keys())
 
         return StackRunConfig(
-            image_name=self.name,
-            docker_image=self.docker_image,
+            image_name=name,
+            docker_image=docker_image,
             built_at=datetime.now(),
             apis=list(apis),
             providers=provider_configs,
-            metadata_store=self.metadata_store,
+            metadata_store=SqliteKVStoreConfig.sample_run_config(
+                dir=f"distributions/{name}",
+                db_name="registry.db",
+            ),
             models=self.default_models,
             shields=self.default_shields or [],
         )
 
-    def docker_compose_config(self) -> Dict[str, Any]:
-        services = {}
-        provider_registry = get_provider_registry()
-
-        # Add provider services based on their sample_compose_config
-        for api_str, api_providers in self.providers.items():
-            if overrides := self.compose_config_overrides.get(api_str):
-                services |= overrides
-                continue
-
-            # only look at the first provider to get the compose config for now
-            # we may want to use `docker compose profiles` in the future
-            provider_type = api_providers[0]
-            provider_id = provider_type.split("::")[-1]
-            api = Api(api_str)
-            if provider_type not in provider_registry[api]:
-                raise ValueError(
-                    f"Unknown provider type: {provider_type} for API: {api_str}"
-                )
-
-            config_class = provider_registry[api][provider_type].config_class
-            assert (
-                config_class is not None
-            ), f"No config class for provider type: {provider_type} for API: {api_str}"
-
-            config_class = instantiate_class_type(config_class)
-            if not hasattr(config_class, "sample_docker_compose_config"):
-                continue
-
-            compose_config = config_class.sample_docker_compose_config()
-            services[provider_id] = compose_config
-
-        port = "${LLAMASTACK_PORT:-5001}"
-        # Add main llamastack service
-        llamastack_config = DockerComposeServiceConfig(
-            image=f"llamastack/distribution-{self.name}:latest",
-            depends_on=list(services.keys()),
-            volumes=[
-                "~/.llama:/root/.llama",
-                f"~/local/llama-stack/distributions/{self.name}/run.yaml:/root/llamastack-run-{self.name}.yaml",
-            ],
-            ports=[f"{port}:{port}"],
-            environment={
-                k: v[0] for k, v in (self.docker_compose_env_vars or {}).items()
-            },
-            entrypoint=(
-                f'bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-{self.name}.yaml --port {port}"'
-            ),
-            deploy={
-                "restart_policy": {
-                    "condition": "on-failure",
-                    "delay": "3s",
-                    "max_attempts": 5,
-                    "window": "60s",
-                }
-            },
-        )
-
-        services["llamastack"] = llamastack_config
-        return {
-            "services": {k: v.model_dump() for k, v in services.items()},
-            "volumes": {service_name: None for service_name in services.keys()},
-        }
-
+
+class DistributionTemplate(BaseModel):
+    """
+    Represents a Llama Stack distribution instance that can generate configuration
+    and documentation files.
+    """
+
+    name: str
+    description: str
+
+    providers: Dict[str, List[str]]
+    run_configs: Dict[str, RunConfigSettings]
+    template_path: Path
+
+    # Optional configuration
+    docker_compose_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
+    docker_image: Optional[str] = None
+
+    default_models: Optional[List[ModelInput]] = None
+
+    def build_config(self) -> BuildConfig:
+        return BuildConfig(
+            name=self.name,
+            distribution_spec=DistributionSpec(
+                description=self.description,
+                docker_image=self.docker_image,
+                providers=self.providers,
+            ),
+            image_type="conda",  # default to conda, can be overridden
+        )
+
     def generate_markdown_docs(self) -> str:
         """Generate markdown documentation using both Jinja2 templates and rich tables."""
         # First generate the providers table using rich
@@ -204,53 +146,7 @@ class DistributionTemplate(BaseModel):
         console.print(table)
         providers_table = output.getvalue()
 
-        # Main documentation template
-        template = """# {{ name }} Distribution
-
-{{ description }}
-
-## Provider Configuration
-
-The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
-
-{{ providers_table }}
-
-{%- if env_vars %}
-## Environment Variables
-
-The following environment variables can be configured:
-
-{% for var, (value, description) in docker_compose_env_vars.items() %}
-- `{{ var }}`: {{ description }}
-{% endfor %}
-{%- endif %}
-
-## Example Usage
-
-### Using Docker Compose
-
-```bash
-$ cd distributions/{{ name }}
-$ docker compose up
-```
-
-## Models
-
-The following models are configured by default:
-{% for model in default_models %}
-- `{{ model.model_id }}`
-{% endfor %}
-
-{%- if default_shields %}
-
-## Safety Shields
-
-The following safety shields are configured:
-{% for shield in default_shields %}
-- `{{ shield.shield_id }}`
-{%- endfor %}
-{%- endif %}
-"""
+        template = self.template_path.read_text()
         # Render template with rich-generated table
         env = jinja2.Environment(trim_blocks=True, lstrip_blocks=True)
         template = env.from_string(template)
@@ -261,7 +157,6 @@ The following safety shields are configured:
             providers_table=providers_table,
             docker_compose_env_vars=self.docker_compose_env_vars,
             default_models=self.default_models,
-            default_shields=self.default_shields,
         )
 
     def save_distribution(self, output_dir: Path) -> None:
@@ -271,19 +166,14 @@ The following safety shields are configured:
         with open(output_dir / "build.yaml", "w") as f:
             yaml.safe_dump(build_config.model_dump(), f, sort_keys=False)
 
-        run_config = self.run_config()
-        serialized = run_config.model_dump()
-        with open(output_dir / "run.yaml", "w") as f:
-            yaml.safe_dump(serialized, f, sort_keys=False)
-
-        # serialized_str = yaml.dump(serialized, sort_keys=False)
-        # env_vars = set()
-        # for match in re.finditer(r"\${env\.([A-Za-z0-9_-]+)}", serialized_str):
-        #     env_vars.add(match.group(1))
-
-        docker_compose = self.docker_compose_config()
-        with open(output_dir / "compose.yaml", "w") as f:
-            yaml.safe_dump(docker_compose, f, sort_keys=False, default_flow_style=False)
+        for yaml_pth, settings in self.run_configs.items():
+            print(f"Generating {yaml_pth}")
+            print(f"Providers: {self.providers}")
+            run_config = settings.run_config(
+                self.name, self.providers, self.docker_image
+            )
+            with open(output_dir / yaml_pth, "w") as f:
+                yaml.safe_dump(run_config.model_dump(), f, sort_keys=False)
 
         docs = self.generate_markdown_docs()
         with open(output_dir / f"{self.name}.md", "w") as f:
@@ -291,87 +181,89 @@ The following safety shields are configured:
 
     @classmethod
     def vllm_distribution(cls) -> "DistributionTemplate":
+        providers = {
+            "inference": ["remote::vllm"],
+            "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+            "safety": ["inline::llama-guard"],
+            "agents": ["inline::meta-reference"],
+            "telemetry": ["inline::meta-reference"],
+        }
+
+        inference_provider = Provider(
+            provider_id="vllm-inference",
+            provider_type="remote::vllm",
+            config=VLLMInferenceAdapterConfig.sample_run_config(
+                url="${env.VLLM_URL}",
+            ),
+        )
+
+        inference_model = ModelInput(
+            model_id="${env.INFERENCE_MODEL}",
+            provider_id="vllm-inference",
+        )
+        safety_model = ModelInput(
+            model_id="${env.SAFETY_MODEL}",
+            provider_id="vllm-safety",
+        )
+
         return cls(
             name="remote-vllm",
             description="Use (an external) vLLM server for running LLM inference",
-            providers={
-                "inference": ["remote::vllm"],
-                "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
-                "safety": ["inline::llama-guard"],
-                "agents": ["inline::meta-reference"],
-                "telemetry": ["inline::meta-reference"],
-            },
-            run_config_overrides={
-                "inference": [
-                    Provider(
-                        provider_id="vllm-0",
-                        provider_type="remote::vllm",
-                        config=VLLMInferenceAdapterConfig.sample_run_config(
-                            url="${env.VLLM_URL:http://host.docker.internal:5100/v1}",
-                        ),
-                    ),
-                    Provider(
-                        provider_id="vllm-1",
-                        provider_type="remote::vllm",
-                        config=VLLMInferenceAdapterConfig.sample_run_config(
-                            url="${env.SAFETY_VLLM_URL:http://host.docker.internal:5101/v1}",
-                        ),
-                    ),
-                ]
-            },
-            compose_config_overrides={
-                "inference": {
-                    "vllm-0": VLLMInferenceAdapterConfig.sample_docker_compose_config(
-                        port=5100,
-                        cuda_visible_devices="0",
-                        model="${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
-                    ),
-                    "vllm-1": VLLMInferenceAdapterConfig.sample_docker_compose_config(
-                        port=5100,
-                        cuda_visible_devices="1",
-                        model="${env.SAFETY_MODEL:Llama-Guard-3-1B}",
-                    ),
-                }
-            },
-            default_models=[
-                ModelInput(
-                    model_id="${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
-                    provider_id="vllm-0",
-                ),
-                ModelInput(
-                    model_id="${env.SAFETY_MODEL:Llama-Guard-3-1B}",
-                    provider_id="vllm-1",
-                ),
-            ],
-            default_shields=[
-                ShieldInput(shield_id="${env.SAFETY_MODEL:Llama-Guard-3-1B}")
-            ],
-            docker_compose_env_vars={
-                # these defaults are for the Docker Compose configuration
-                "VLLM_URL": (
-                    "http://host.docker.internal:${VLLM_PORT:-5100}/v1",
-                    "URL of the vLLM server with the main inference model",
-                ),
-                "SAFETY_VLLM_URL": (
-                    "http://host.docker.internal:${SAFETY_VLLM_PORT:-5101}/v1",
-                    "URL of the vLLM server with the safety model",
-                ),
-                "MAX_TOKENS": (
-                    "${MAX_TOKENS:-4096}",
-                    "Maximum number of tokens for generation",
-                ),
-                "INFERENCE_MODEL": (
-                    "${INFERENCE_MODEL:-Llama3.2-3B-Instruct}",
-                    "Name of the inference model to use",
-                ),
-                "SAFETY_MODEL": (
-                    "${SAFETY_MODEL:-Llama-Guard-3-1B}",
-                    "Name of the safety (Llama-Guard) model to use",
-                ),
-                "LLAMASTACK_PORT": (
-                    "${LLAMASTACK_PORT:-5001}",
-                    "Port for the Llama Stack distribution server",
-                ),
+            template_path=Path(__file__).parent / "remote-vllm" / "doc_template.md",
+            providers=providers,
+            default_models=[inference_model, safety_model],
+            run_configs={
+                "run.yaml": RunConfigSettings(
+                    provider_overrides={
+                        "inference": [inference_provider],
+                    },
+                    default_models=[inference_model],
+                ),
+                "safety-run.yaml": RunConfigSettings(
+                    provider_overrides={
+                        "inference": [
+                            inference_provider,
+                            Provider(
+                                provider_id="vllm-safety",
+                                provider_type="remote::vllm",
+                                config=VLLMInferenceAdapterConfig.sample_run_config(
+                                    url="${env.SAFETY_VLLM_URL}",
+                                ),
+                            ),
+                        ],
+                    },
+                    default_models=[
+                        inference_model,
+                        safety_model,
+                    ],
+                    default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
+                ),
+            },
+            docker_compose_env_vars={
+                "LLAMASTACK_PORT": (
+                    "5001",
+                    "Port for the Llama Stack distribution server",
+                ),
+                "INFERENCE_MODEL": (
+                    "meta-llama/Llama-3.2-3B-Instruct",
+                    "Inference model loaded into the vLLM server",
+                ),
+                "VLLM_URL": (
+                    "http://host.docker.internal:5100}/v1",
+                    "URL of the vLLM server with the main inference model",
+                ),
+                "MAX_TOKENS": (
+                    "4096",
+                    "Maximum number of tokens for generation",
+                ),
+                "SAFETY_VLLM_URL": (
+                    "http://host.docker.internal:5101/v1",
+                    "URL of the vLLM server with the safety model",
+                ),
+                "SAFETY_MODEL": (
+                    "meta-llama/Llama-Guard-3-1B",
+                    "Name of the safety (Llama-Guard) model to use",
+                ),
             },
         )
 
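Taken together, the refactor describes a distribution once and derives each run config variant from it. A minimal usage sketch is shown below; the driver code and the import path are assumptions for illustration, not part of this commit:

```python
from pathlib import Path

# Assumed import location for the template module shown in this diff.
from llama_stack.templates.template import DistributionTemplate

# Builds the remote-vllm template, then writes build.yaml, one YAML per entry
# in run_configs (run.yaml and safety-run.yaml here), and the rendered docs.
template = DistributionTemplate.vllm_distribution()
template.save_distribution(Path("./distributions/remote-vllm"))
```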