forked from phoenix-oss/llama-stack-mirror
Fixes
This commit is contained in:
  parent 3aedde2ab4
  commit 47c37fd831

3 changed files with 38 additions and 10 deletions
@@ -42,6 +42,7 @@ docker run \
 -p $INFERENCE_PORT:$INFERENCE_PORT \
 --ipc=host \
 vllm/vllm-openai:latest \
+--gpu-memory-utilization 0.7 \
 --model $INFERENCE_MODEL \
 --port $INFERENCE_PORT
 ```
@@ -61,6 +62,7 @@ docker run \
 -p $SAFETY_PORT:$SAFETY_PORT \
 --ipc=host \
 vllm/vllm-openai:latest \
+--gpu-memory-utilization 0.7 \
 --model $SAFETY_MODEL \
 --port $SAFETY_PORT
 ```
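
> Editor's note, not part of this commit: a quick sanity-check sketch. Both vLLM containers expose an OpenAI-compatible API, so once they are up you can list the served models (ports assume the `$INFERENCE_PORT`/`$SAFETY_PORT` values used in these docs).

```bash
# Sanity check (not in this commit): each vLLM server is OpenAI-compatible,
# so /v1/models should return the model it was started with.
curl -s http://127.0.0.1:$INFERENCE_PORT/v1/models
curl -s http://127.0.0.1:$SAFETY_PORT/v1/models
```
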
@@ -74,7 +76,10 @@ Now you are ready to run Llama Stack with vLLM as the inference provider. You ca
 This method allows you to get started quickly without having to build the distribution code.
 
 ```bash
-LLAMA_STACK_PORT=5001
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
+
 docker run \
 -it \
 -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@@ -89,6 +94,9 @@ docker run \
 If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
 docker run \
 -it \
 -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@@ -108,9 +116,15 @@ docker run \
 Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 
 ```bash
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
+
 cd distributions/remote-vllm
 llama stack build --template remote-vllm --image-type conda
 
 llama stack run ./run.yaml \
---port 5001 \
+--port $LLAMA_STACK_PORT \
+--env INFERENCE_MODEL=$INFERENCE_MODEL \
+--env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT
 ```
@@ -119,7 +133,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
 llama stack run ./run-with-safety.yaml \
---port 5001 \
+--port $LLAMA_STACK_PORT \
 --env INFERENCE_MODEL=$INFERENCE_MODEL \
 --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \
 --env SAFETY_MODEL=$SAFETY_MODEL \

@@ -41,7 +41,7 @@ while [[ $# -gt 0 ]]; do
 
 if [[ -n "$2" ]]; then
 # collect environment variables so we can set them after activating the conda env
-env_vars="$env_vars $2"
+env_vars="$env_vars --env $2"
 shift 2
 else
 echo -e "${RED}Error: --env requires a KEY=VALUE argument${NC}" >&2
@@ -58,8 +58,8 @@ eval "$(conda shell.bash hook)"
 conda deactivate && conda activate "$env_name"
 
 set -x
-$env_vars \
-$CONDA_PREFIX/bin/python \
+$CONDA_PREFIX/bin/python \
 -m llama_stack.distribution.server.server \
 --yaml_config "$yaml_config" \
---port "$port" "$@"
+--port "$port" \
+"$env_vars"

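> Editor's note, not part of this commit: taken together, the two hunks in this script change how `--env KEY=VALUE` arguments are handled. Instead of being prefixed to the command as shell variable assignments, they are collected as `--env` flags and appended to the server invocation. A minimal sketch of the resulting behavior follows; the invocation and values are illustrative, not taken from the commit.

```bash
#!/usr/bin/env bash
# Sketch (assumed invocation): collect --env KEY=VALUE pairs the way the
# updated script does, then forward them to the server command.
env_vars=""
for arg in "INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct" "VLLM_URL=http://127.0.0.1:8000"; do
  env_vars="$env_vars --env $arg"
done

# The server command now ends with the forwarded flags, e.g.:
#   python -m llama_stack.distribution.server.server \
#     --yaml_config run.yaml --port 5001 \
#     --env INFERENCE_MODEL=... --env VLLM_URL=...
echo "python -m llama_stack.distribution.server.server --yaml_config run.yaml --port 5001$env_vars"
```
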
@@ -34,6 +34,7 @@ docker run \
 -p $INFERENCE_PORT:$INFERENCE_PORT \
 --ipc=host \
 vllm/vllm-openai:latest \
+--gpu-memory-utilization 0.7 \
 --model $INFERENCE_MODEL \
 --port $INFERENCE_PORT
 ```
@@ -53,6 +54,7 @@ docker run \
 -p $SAFETY_PORT:$SAFETY_PORT \
 --ipc=host \
 vllm/vllm-openai:latest \
+--gpu-memory-utilization 0.7 \
 --model $SAFETY_MODEL \
 --port $SAFETY_PORT
 ```
@@ -66,7 +68,10 @@ Now you are ready to run Llama Stack with vLLM as the inference provider. You ca
 This method allows you to get started quickly without having to build the distribution code.
 
 ```bash
-LLAMA_STACK_PORT=5001
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
+
 docker run \
 -it \
 -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@@ -81,6 +86,9 @@ docker run \
 If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
 docker run \
 -it \
 -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@@ -100,9 +108,15 @@ docker run \
 Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 
 ```bash
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
+
 cd distributions/remote-vllm
 llama stack build --template remote-vllm --image-type conda
 
 llama stack run ./run.yaml \
---port 5001 \
+--port $LLAMA_STACK_PORT \
+--env INFERENCE_MODEL=$INFERENCE_MODEL \
+--env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT
 ```
@@ -111,7 +125,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
 llama stack run ./run-with-safety.yaml \
---port 5001 \
+--port $LLAMA_STACK_PORT \
 --env INFERENCE_MODEL=$INFERENCE_MODEL \
 --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \
 --env SAFETY_MODEL=$SAFETY_MODEL \