add inline-vllm details, fix things

This commit is contained in:
Ashwin Bharambe 2024-11-08 12:01:05 -08:00
parent 02c66b49fc
commit 38cdbdec5a
12 changed files with 142 additions and 101 deletions

View file

@ -1 +1 @@
../../llama_stack/templates/vllm/build.yaml
../../llama_stack/templates/inline-vllm/build.yaml

View file

@ -0,0 +1,35 @@
services:
llamastack:
image: llamastack/distribution-inline-vllm
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
- ./run.yaml:/root/my-run.yaml
ports:
- "5000:5000"
devices:
- nvidia.com/gpu=all
environment:
- CUDA_VISIBLE_DEVICES=0
command: []
deploy:
resources:
reservations:
devices:
- driver: nvidia
# that's the closest analogue to --gpus; provide
# an integer amount of devices or 'all'
count: 1
# Devices are reserved using a list of capabilities, making
# capabilities the only required field. A device MUST
# satisfy all the requested capabilities for a successful
# reservation.
capabilities: [gpu]
runtime: nvidia
entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s

View file

@ -0,0 +1,66 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: vllm-inference
provider_type: inline::vllm
config:
model: Llama3.2-3B-Instruct
tensor_parallel_size: 1
gpu_memory_utilization: 0.4
enforce_eager: true
max_tokens: 4096
- provider_id: vllm-safety
provider_type: inline::vllm
config:
model: Llama-Guard-3-1B
tensor_parallel_size: 1
gpu_memory_utilization: 0.2
enforce_eager: true
max_tokens: 4096
safety:
- provider_id: meta0
provider_type: meta-reference
config:
llama_guard_shield:
model: Llama-Guard-3-1B
excluded_categories: []
# Uncomment to use prompt guard
# prompt_guard_shield:
# model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: meta-reference
config: {}
# Uncomment to use pgvector
# - provider_id: pgvector
# provider_type: remote::pgvector
# config:
# host: 127.0.0.1
# port: 5432
# db: postgres
# user: postgres
# password: mysecretpassword
agents:
- provider_id: meta0
provider_type: meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/agents_store.db
telemetry:
- provider_id: meta0
provider_type: meta-reference
config: {}

View file

@ -1 +1 @@
../../llama_stack/templates/ollama/build.yaml
../../llama_stack/templates/remote-vllm/build.yaml

View file

@ -1,11 +1,11 @@
services:
ollama:
image: ollama/ollama:latest
vllm:
image: vllm/vllm-openai:latest
network_mode: "host"
volumes:
- ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
- $HOME/.cache/huggingface:/root/.cache/huggingface
ports:
- "11434:11434"
- "8000:8000"
devices:
- nvidia.com/gpu=all
environment:
@ -27,17 +27,17 @@ services:
runtime: nvidia
llamastack:
depends_on:
- ollama
image: llamastack/distribution-ollama
- vllm
image: llamastack/distribution-remote-vllm
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
# Link to ollama run.yaml file
- ./run.yaml:/root/llamastack-run-ollama.yaml
- ./run.yaml:/root/llamastack-run-remote-vllm.yaml
ports:
- "5000:5000"
# Hack: wait for ollama server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-ollama.yaml"
# Hack: wait for vllm server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml"
deploy:
restart_policy:
condition: on-failure
@ -45,4 +45,4 @@ services:
max_attempts: 5
window: 60s
volumes:
ollama:
vllm:

View file

@ -1,30 +0,0 @@
services:
ollama:
image: ollama/ollama:latest
network_mode: "host"
volumes:
- ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
ports:
- "11434:11434"
command: []
llamastack:
depends_on:
- ollama
image: llamastack/distribution-ollama
network_mode: "host"
volumes:
- ~/.llama:/root/.llama
# Link to ollama run.yaml file
- ./run.yaml:/root/my-run.yaml
ports:
- "5000:5000"
# Hack: wait for ollama server to start before starting docker
entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
deploy:
restart_policy:
condition: on-failure
delay: 3s
max_attempts: 5
window: 60s
volumes:
ollama:

View file

@ -1,46 +0,0 @@
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
inference:
- provider_id: ollama0
provider_type: remote::ollama
config:
url: http://127.0.0.1:14343
safety:
- provider_id: meta0
provider_type: meta-reference
config:
llama_guard_shield:
model: Llama-Guard-3-1B
excluded_categories: []
disable_input_check: false
disable_output_check: false
prompt_guard_shield:
model: Prompt-Guard-86M
memory:
- provider_id: meta0
provider_type: meta-reference
config: {}
agents:
- provider_id: meta0
provider_type: meta-reference
config:
persistence_store:
namespace: null
type: sqlite
db_path: ~/.llama/runtime/kvstore.db
telemetry:
- provider_id: meta0
provider_type: meta-reference
config: {}

View file

@ -13,10 +13,10 @@ apis:
- safety
providers:
inference:
- provider_id: ollama0
provider_type: remote::ollama
- provider_id: vllm0
provider_type: remote::vllm
config:
url: http://127.0.0.1:14343
url: http://127.0.0.1:8000
safety:
- provider_id: meta0
provider_type: meta-reference

View file

@ -45,7 +45,7 @@ def available_providers() -> List[ProviderSpec]:
),
InlineProviderSpec(
api=Api.inference,
provider_type="vllm",
provider_type="inline::vllm",
pip_packages=[
"vllm",
],

View file

@ -0,0 +1,13 @@
name: meta-reference-gpu
distribution_spec:
docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
description: Use code from `llama_stack` itself to serve all llama stack APIs
providers:
inference: meta-reference
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -0,0 +1,12 @@
name: remote-vllm
distribution_spec:
description: Use (an external) vLLM server for running LLM inference
providers:
inference: remote::vllm
memory:
- meta-reference
- remote::chromadb
- remote::pgvector
safety: meta-reference
agents: meta-reference
telemetry: meta-reference

View file

@ -1,9 +0,0 @@
name: vllm
distribution_spec:
description: Like local, but use vLLM for running LLM inference
providers:
inference: vllm
memory: meta-reference
safety: meta-reference
agents: meta-reference
telemetry: meta-reference