commit 47c37fd831 (parent 3aedde2ab4)
Author: Ashwin Bharambe
Date:   2024-11-18 16:03:20 -08:00

3 changed files with 38 additions and 10 deletions

View file

@@ -42,6 +42,7 @@ docker run \
   -p $INFERENCE_PORT:$INFERENCE_PORT \
   --ipc=host \
   vllm/vllm-openai:latest \
+  --gpu-memory-utilization 0.7 \
   --model $INFERENCE_MODEL \
   --port $INFERENCE_PORT
 ```
@@ -61,6 +62,7 @@ docker run \
   -p $SAFETY_PORT:$SAFETY_PORT \
   --ipc=host \
   vllm/vllm-openai:latest \
+  --gpu-memory-utilization 0.7 \
   --model $SAFETY_MODEL \
   --port $SAFETY_PORT
 ```
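The `--gpu-memory-utilization 0.7` flag added in the two hunks above caps the fraction of GPU memory vLLM reserves at 0.7 (the vLLM default is 0.9), leaving headroom when the inference and safety servers share a GPU. Once either container is up, a quick way to confirm it is serving the expected model is to query its OpenAI-compatible API; a minimal check, assuming the server is reachable on localhost at the port used above:

```bash
# Sanity check against the vLLM server started above (assumes it listens on
# $INFERENCE_PORT of the local host; adjust host/port for your setup).
export INFERENCE_PORT=8000
curl -s http://localhost:$INFERENCE_PORT/v1/models
# The JSON response should list $INFERENCE_MODEL among the served models.
```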
@@ -74,7 +76,10 @@ Now you are ready to run Llama Stack with vLLM as the inference provider. You ca
 This method allows you to get started quickly without having to build the distribution code.
 ```bash
-LLAMA_STACK_PORT=5001
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@@ -89,6 +94,9 @@ docker run \
 If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@@ -108,9 +116,15 @@ docker run \
 Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 ```bash
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
+cd distributions/remote-vllm
 llama stack build --template remote-vllm --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT
 ```
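After `llama stack run` comes up, the stack can be exercised the same way; a minimal sketch, assuming the `llama-stack-client` CLI is installed (`pip install llama-stack-client`) and that your version accepts a global `--endpoint` option (check `llama-stack-client --help`, as flags vary across releases):

```bash
# Hypothetical verification step, not part of the documented setup:
# list the models registered with the running Llama Stack server.
llama-stack-client --endpoint http://localhost:$LLAMA_STACK_PORT models list
```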
@@ -119,7 +133,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 llama stack run ./run-with-safety.yaml \
-  --port 5001 \
+  --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \
   --env SAFETY_MODEL=$SAFETY_MODEL \

View file

@@ -41,7 +41,7 @@ while [[ $# -gt 0 ]]; do
       if [[ -n "$2" ]]; then
         # collect environment variables so we can set them after activating the conda env
-        env_vars="$env_vars $2"
+        env_vars="$env_vars --env $2"
         shift 2
       else
         echo -e "${RED}Error: --env requires a KEY=VALUE argument${NC}" >&2
@@ -58,8 +58,8 @@ eval "$(conda shell.bash hook)"
 conda deactivate && conda activate "$env_name"
 set -x
-$env_vars \
-  $CONDA_PREFIX/bin/python \
+$CONDA_PREFIX/bin/python \
   -m llama_stack.distribution.server.server \
   --yaml_config "$yaml_config" \
-  --port "$port" "$@"
+  --port "$port" \
+  "$env_vars"

View file

@@ -34,6 +34,7 @@ docker run \
   -p $INFERENCE_PORT:$INFERENCE_PORT \
   --ipc=host \
   vllm/vllm-openai:latest \
+  --gpu-memory-utilization 0.7 \
   --model $INFERENCE_MODEL \
   --port $INFERENCE_PORT
 ```
@@ -53,6 +54,7 @@ docker run \
   -p $SAFETY_PORT:$SAFETY_PORT \
   --ipc=host \
   vllm/vllm-openai:latest \
+  --gpu-memory-utilization 0.7 \
   --model $SAFETY_MODEL \
   --port $SAFETY_PORT
 ```
@@ -66,7 +68,10 @@ Now you are ready to run Llama Stack with vLLM as the inference provider. You ca
 This method allows you to get started quickly without having to build the distribution code.
 ```bash
-LLAMA_STACK_PORT=5001
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@@ -81,6 +86,9 @@ docker run \
 If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@@ -100,9 +108,15 @@ docker run \
 Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 ```bash
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
+cd distributions/remote-vllm
 llama stack build --template remote-vllm --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT
 ```
@@ -111,7 +125,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 ```bash
 llama stack run ./run-with-safety.yaml \
-  --port 5001 \
+  --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \
   --env SAFETY_MODEL=$SAFETY_MODEL \