forked from phoenix-oss/llama-stack-mirror

Fixes

commit 47c37fd831 (parent 3aedde2ab4)
3 changed files with 38 additions and 10 deletions
The first of the three changed files is the remote-vllm distribution guide. The vLLM serving command for the inference model gains a `--gpu-memory-utilization` flag:

````diff
@@ -42,6 +42,7 @@ docker run \
 -p $INFERENCE_PORT:$INFERENCE_PORT \
 --ipc=host \
 vllm/vllm-openai:latest \
+--gpu-memory-utilization 0.7 \
 --model $INFERENCE_MODEL \
 --port $INFERENCE_PORT
 ```
````
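For context, a minimal sketch of the full inference serving command after this change; the `--gpus` flag and the Hugging Face cache mount are assumptions that sit outside this hunk, while the exports and remaining flags come from the diff:

```bash
# Sketch only: --gpus and the cache mount are assumed, not part of this diff.
export INFERENCE_PORT=8000
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct

docker run \
  --gpus all \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  -p $INFERENCE_PORT:$INFERENCE_PORT \
  --ipc=host \
  vllm/vllm-openai:latest \
  --gpu-memory-utilization 0.7 \
  --model $INFERENCE_MODEL \
  --port $INFERENCE_PORT
```

`--gpu-memory-utilization` caps the fraction of GPU memory vLLM pre-allocates for weights and KV cache (its default is 0.9), so 0.7 leaves headroom for other processes on the same GPU.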
The same flag is added to the safety-model serving command:

````diff
@@ -61,6 +62,7 @@ docker run \
 -p $SAFETY_PORT:$SAFETY_PORT \
 --ipc=host \
 vllm/vllm-openai:latest \
+--gpu-memory-utilization 0.7 \
 --model $SAFETY_MODEL \
 --port $SAFETY_PORT
 ```
````
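Both vLLM containers expose an OpenAI-compatible API, so a quick, hedged check that the inference server is up before starting Llama Stack (using `localhost` assumes the container publishes its port on the same host):

```bash
# List the models served by the inference vLLM container.
export INFERENCE_PORT=8000
curl -s http://localhost:$INFERENCE_PORT/v1/models
```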
The docker quick-start section replaces the bare port assignment with explicit exports for the port, the inference model, and the stack port:

````diff
@@ -74,7 +76,10 @@ Now you are ready to run Llama Stack with vLLM as the inference provider. You ca
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
+
 docker run \
 -it \
 -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
````
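The rest of this docker command sits outside the hunk. As a rough sketch of where the new exports get consumed, assuming the distribution image name, its entrypoint flags, and the `host.docker.internal` address (only the exports and the first three docker lines come from this hunk):

```bash
# Hedged sketch: image name, entrypoint flags, and VLLM_URL host are assumptions.
export INFERENCE_PORT=8000
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export LLAMA_STACK_PORT=5001

docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-remote-vllm \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT
```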
The safety variant of the docker quick start gains matching exports for the safety port and shield model:

````diff
@@ -89,6 +94,9 @@ docker run \
 If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
 docker run \
 -it \
 -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
````
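A hedged sanity check: the `SAFETY_MODEL` value passed to the stack should match what the safety vLLM container is actually serving, which its OpenAI-compatible server lists at `/v1/models` (again assuming the port is published on `localhost`):

```bash
# Confirm the shield model is being served by the safety vLLM container.
export SAFETY_PORT=8081
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
curl -s http://localhost:$SAFETY_PORT/v1/models | grep -o "$SAFETY_MODEL"
```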
The conda instructions pick up the same exports, add a `cd` into the distribution directory, and switch the hard-coded port to `$LLAMA_STACK_PORT`:

````diff
@@ -108,9 +116,15 @@ docker run \
 Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.

 ```bash
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
+
+cd distributions/remote-vllm
 llama stack build --template remote-vllm --image-type conda
+
 llama stack run ./run.yaml \
---port 5001 \
+--port $LLAMA_STACK_PORT \
 --env INFERENCE_MODEL=$INFERENCE_MODEL \
 --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT
 ```
````
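The sentence kept as context in this hunk assumes the CLI is already installed. A quick, hedged way to confirm that before running the build (assuming `pip` targets the active conda environment):

```bash
# Install the package and confirm the `llama stack` CLI is on PATH.
pip install llama-stack
llama stack --help
```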
The safety variant of the conda run command gets the same port fix:

````diff
@@ -119,7 +133,7 @@ If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
 llama stack run ./run-with-safety.yaml \
---port 5001 \
+--port $LLAMA_STACK_PORT \
 --env INFERENCE_MODEL=$INFERENCE_MODEL \
 --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \
 --env SAFETY_MODEL=$SAFETY_MODEL \
````
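Pulling the new exports and the updated flags from the hunks above into one place gives a sketch of the conda safety launch. The real command continues past `--env SAFETY_MODEL=...` (the trailing backslash shows more flags follow); those lines are outside this diff and omitted here:

```bash
# Sketch assembled from the hunks above; the real run command has additional flags.
export INFERENCE_PORT=8000
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_PORT=8081
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export LLAMA_STACK_PORT=5001

llama stack run ./run-with-safety.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \
  --env SAFETY_MODEL=$SAFETY_MODEL
```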
The second changed file is the shell script that activates the conda environment and launches the server. The `--env` option handler now keeps the flag itself, not just the `KEY=VALUE` string:

````diff
@@ -41,7 +41,7 @@ while [[ $# -gt 0 ]]; do

 if [[ -n "$2" ]]; then
 # collect environment variables so we can set them after activating the conda env
-env_vars="$env_vars $2"
+env_vars="$env_vars --env $2"
 shift 2
 else
 echo -e "${RED}Error: --env requires a KEY=VALUE argument${NC}" >&2
````
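The effect of this one-word change is easier to see in context. A minimal sketch of the option loop, assuming the usual `while`/`case` structure around the lines shown (only the variable names and messages in the hunk are taken from the file; the error formatting is simplified):

```bash
env_vars=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --env)
      if [[ -n "$2" ]]; then
        # Keep the whole flag so it can be forwarded to the server verbatim.
        env_vars="$env_vars --env $2"
        shift 2
      else
        echo "Error: --env requires a KEY=VALUE argument" >&2
        exit 1
      fi
      ;;
    *)
      shift
      ;;
  esac
done
```

Before the change only the bare `KEY=VALUE` string was collected; preserving the `--env` prefix lets the values be handed straight to the server command, as the next hunk shows.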
The launch command then moves the collected values from in front of the python invocation to the end of the server's argument list:

````diff
@@ -58,8 +58,8 @@ eval "$(conda shell.bash hook)"
 conda deactivate && conda activate "$env_name"

 set -x
-$env_vars \
-$CONDA_PREFIX/bin/python \
+$CONDA_PREFIX/bin/python \
 -m llama_stack.distribution.server.server \
 --yaml_config "$yaml_config" \
---port "$port" "$@"
+--port "$port" \
+"$env_vars"
````
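A sketch of what the reordering changes, with placeholder values (the config path, port, and model below are illustrative, not taken from the file):

```bash
# Before: the collected values were prefixed to the command, i.e. passed as shell
# environment variables for the python process:
#   INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
#   $CONDA_PREFIX/bin/python -m llama_stack.distribution.server.server \
#     --yaml_config run.yaml --port 5001

# After: they are forwarded as --env flags that the server parses itself:
$CONDA_PREFIX/bin/python -m llama_stack.distribution.server.server \
  --yaml_config run.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
```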
The third changed file receives the same documentation updates as the first (a second copy of the remote-vllm guide):

````diff
@@ -34,6 +34,7 @@ docker run \
 -p $INFERENCE_PORT:$INFERENCE_PORT \
 --ipc=host \
 vllm/vllm-openai:latest \
+--gpu-memory-utilization 0.7 \
 --model $INFERENCE_MODEL \
 --port $INFERENCE_PORT
 ```
@@ -53,6 +54,7 @@ docker run \
 -p $SAFETY_PORT:$SAFETY_PORT \
 --ipc=host \
 vllm/vllm-openai:latest \
+--gpu-memory-utilization 0.7 \
 --model $SAFETY_MODEL \
 --port $SAFETY_PORT
 ```
@@ -66,7 +68,10 @@ Now you are ready to run Llama Stack with vLLM as the inference provider. You ca
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
+
 docker run \
 -it \
 -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@@ -81,6 +86,9 @@ docker run \
 If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
+export SAFETY_PORT=8081
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
 docker run \
 -it \
 -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
@@ -100,9 +108,15 @@ docker run \
 Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.

 ```bash
+export INFERENCE_PORT=8000
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export LLAMA_STACK_PORT=5001
+
+cd distributions/remote-vllm
 llama stack build --template remote-vllm --image-type conda
+
 llama stack run ./run.yaml \
---port 5001 \
+--port $LLAMA_STACK_PORT \
 --env INFERENCE_MODEL=$INFERENCE_MODEL \
 --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT
 ```
@@ -111,7 +125,7 @@ If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
 llama stack run ./run-with-safety.yaml \
---port 5001 \
+--port $LLAMA_STACK_PORT \
 --env INFERENCE_MODEL=$INFERENCE_MODEL \
 --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \
 --env SAFETY_MODEL=$SAFETY_MODEL \
````