forked from phoenix-oss/llama-stack-mirror
Fixes
This commit is contained in:
  parent 3aedde2ab4
  commit 47c37fd831

3 changed files with 38 additions and 10 deletions
@@ -42,6 +42,7 @@ docker run \
 -p $INFERENCE_PORT:$INFERENCE_PORT \
 --ipc=host \
 vllm/vllm-openai:latest \
+--gpu-memory-utilization 0.7 \
 --model $INFERENCE_MODEL \
 --port $INFERENCE_PORT
 ```
@@ -61,6 +62,7 @@ docker run \
 -p $SAFETY_PORT:$SAFETY_PORT \
 --ipc=host \
 vllm/vllm-openai:latest \
+--gpu-memory-utilization 0.7 \
 --model $SAFETY_MODEL \
 --port $SAFETY_PORT
 ```
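
> Editor's note, not part of this commit: a quick sanity-check sketch. Both vLLM containers expose an OpenAI-compatible API, so once they are up you can list the served models (ports assume the `$INFERENCE_PORT`/`$SAFETY_PORT` values used in these docs).

```bash
# Sanity check (not in this commit): each vLLM server is OpenAI-compatible,
# so /v1/models should return the model it was started with.
curl -s http://127.0.0.1:$INFERENCE_PORT/v1/models
curl -s http://127.0.0.1:$SAFETY_PORT/v1/models
```
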
@@ -74,7 +76,10 @@ Now you are ready to run Llama Stack with vLLM as the inference provider. You ca
 This method allows you to get started quickly without having to build the distribution code.
 
 ```bash
-LLAMA_STACK_PORT=5001
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
+
 docker run \
 -it \
 -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@@ -89,6 +94,9 @@ docker run \
 If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
 docker run \
 -it \
 -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@@ -108,9 +116,15 @@ docker run \
 Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 
 ```bash
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
+
 cd distributions/remote-vllm
 llama stack build --template remote-vllm --image-type conda
 
 llama stack run ./run.yaml \
---port 5001 \
+--port $LLAMA_STACK_PORT \
+--env INFERENCE_MODEL=$INFERENCE_MODEL \
+--env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT
 ```
@@ -119,7 +133,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
 llama stack run ./run-with-safety.yaml \
---port 5001 \
+--port $LLAMA_STACK_PORT \
 --env INFERENCE_MODEL=$INFERENCE_MODEL \
 --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \
 --env SAFETY_MODEL=$SAFETY_MODEL \

@@ -41,7 +41,7 @@ while [[ $# -gt 0 ]]; do
 
 if [[ -n "$2" ]]; then
 # collect environment variables so we can set them after activating the conda env
-env_vars="$env_vars $2"
+env_vars="$env_vars --env $2"
 shift 2
 else
 echo -e "${RED}Error: --env requires a KEY=VALUE argument${NC}" >&2
@@ -58,8 +58,8 @@ eval "$(conda shell.bash hook)"
 conda deactivate && conda activate "$env_name"
 
 set -x
-$env_vars \
-$CONDA_PREFIX/bin/python \
+$CONDA_PREFIX/bin/python \
 -m llama_stack.distribution.server.server \
 --yaml_config "$yaml_config" \
---port "$port" "$@"
+--port "$port" \
+"$env_vars"

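> Editor's note, not part of this commit: taken together, the two hunks in this script change how `--env KEY=VALUE` arguments are handled. Instead of being prefixed to the command as shell variable assignments, they are collected as `--env` flags and appended to the server invocation. A minimal sketch of the resulting behavior follows; the invocation and values are illustrative, not taken from the commit.

```bash
#!/usr/bin/env bash
# Sketch (assumed invocation): collect --env KEY=VALUE pairs the way the
# updated script does, then forward them to the server command.
env_vars=""
for arg in "INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct" "VLLM_URL=http://127.0.0.1:8000"; do
  env_vars="$env_vars --env $arg"
done

# The server command now ends with the forwarded flags, e.g.:
#   python -m llama_stack.distribution.server.server \
#     --yaml_config run.yaml --port 5001 \
#     --env INFERENCE_MODEL=... --env VLLM_URL=...
echo "python -m llama_stack.distribution.server.server --yaml_config run.yaml --port 5001$env_vars"
```
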
@@ -34,6 +34,7 @@ docker run \
 -p $INFERENCE_PORT:$INFERENCE_PORT \
 --ipc=host \
 vllm/vllm-openai:latest \
+--gpu-memory-utilization 0.7 \
 --model $INFERENCE_MODEL \
 --port $INFERENCE_PORT
 ```
@@ -53,6 +54,7 @@ docker run \
 -p $SAFETY_PORT:$SAFETY_PORT \
 --ipc=host \
 vllm/vllm-openai:latest \
+--gpu-memory-utilization 0.7 \
 --model $SAFETY_MODEL \
 --port $SAFETY_PORT
 ```
@@ -66,7 +68,10 @@ Now you are ready to run Llama Stack with vLLM as the inference provider. You ca
 This method allows you to get started quickly without having to build the distribution code.
 
 ```bash
-LLAMA_STACK_PORT=5001
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
+
 docker run \
 -it \
 -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@@ -81,6 +86,9 @@ docker run \
 If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
 docker run \
 -it \
 -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@@ -100,9 +108,15 @@ docker run \
 Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 
 ```bash
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
+
 cd distributions/remote-vllm
 llama stack build --template remote-vllm --image-type conda
 
 llama stack run ./run.yaml \
---port 5001 \
+--port $LLAMA_STACK_PORT \
+--env INFERENCE_MODEL=$INFERENCE_MODEL \
+--env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT
 ```
@@ -111,7 +125,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 
 ```bash
 llama stack run ./run-with-safety.yaml \
---port 5001 \
+--port $LLAMA_STACK_PORT \
 --env INFERENCE_MODEL=$INFERENCE_MODEL \
 --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \
 --env SAFETY_MODEL=$SAFETY_MODEL \