Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-28 02:53:30 +00:00)

commit 127bac6869 (parent 581e8ae562)

fix: Default to port 8321 everywhere (#1734)

As titled: moved all instances of 5001 to 8321.

56 changed files with 2352 additions and 2305 deletions
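For orientation, a minimal sketch of what the new default means in practice; the `llama-stack-client configure` command is the one that appears in the Getting Started hunks below, the rest is illustrative:

```bash
# The server now defaults to port 8321 (previously 5001); LLAMA_STACK_PORT still overrides it.
export LLAMA_STACK_PORT=8321

# Point the CLI client at the new default endpoint.
llama-stack-client configure --endpoint "http://localhost:${LLAMA_STACK_PORT}"
```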
@@ -51,14 +51,14 @@ services:
       - ~/local/llama-stack/:/app/llama-stack-source
       - ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
     ports:
-      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
+      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
     environment:
       - INFERENCE_MODEL=${INFERENCE_MODEL}
       - SAFETY_MODEL=${SAFETY_MODEL:-}
       - OLLAMA_URL=http://ollama:11434
     entrypoint: >
       python -m llama_stack.distribution.server.server /root/my-run.yaml \
-      --port ${LLAMA_STACK_PORT:-5001}
+      --port ${LLAMA_STACK_PORT:-8321}
     deploy:
       restart_policy:
         condition: on-failure
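Because the compose file substitutes `${LLAMA_STACK_PORT:-8321}`, the new default only applies when the variable is unset. A hedged sketch of both cases, run from the directory containing this compose file:

```bash
# Publish the stack on the new default port, 8321.
docker compose up

# Or pick another host port without editing the YAML.
LLAMA_STACK_PORT=9000 docker compose up
```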
@@ -84,9 +84,9 @@ services:
       - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
       - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
     ports:
-      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
+      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
     # Hack: wait for vLLM server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
+    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 8321"
     deploy:
       restart_policy:
         condition: on-failure
@@ -83,7 +83,7 @@ services:
       - ~/.llama:/root/.llama
       - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
     ports:
-      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
+      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
     # Hack: wait for TGI server to start before starting docker
     entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
     restart_policy:
@@ -58,7 +58,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -75,7 +75,7 @@ docker run \
 ```bash
 llama stack build --template nvidia --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port 8321 \
   --env NVIDIA_API_KEY=$NVIDIA_API_KEY
   --env INFERENCE_MODEL=$INFERENCE_MODEL
 ```
@@ -28,7 +28,7 @@ The `llamastack/distribution-bedrock` distribution consists of the following pro

 The following environment variables can be configured:

-- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)

 ### Models

@@ -53,7 +53,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -20,7 +20,7 @@ The `llamastack/distribution-cerebras` distribution consists of the following pr

 The following environment variables can be configured:

-- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
 - `CEREBRAS_API_KEY`: Cerebras API Key (default: ``)

 ### Models
@@ -45,7 +45,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -62,6 +62,6 @@ docker run \
 ```bash
 llama stack build --template cerebras --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port 8321 \
   --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
 ```
@@ -30,7 +30,7 @@ The `llamastack/distribution-fireworks` distribution consists of the following p

 The following environment variables can be configured:

-- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
 - `FIREWORKS_API_KEY`: Fireworks.AI API Key (default: ``)

 ### Models
@@ -63,7 +63,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -30,7 +30,7 @@ The `llamastack/distribution-groq` distribution consists of the following provid

 The following environment variables can be configured:

-- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
 - `GROQ_API_KEY`: Groq API Key (default: ``)

 ### Models
@@ -58,7 +58,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -32,7 +32,7 @@ Note that you need access to nvidia GPUs to run this distribution. This distribu

 The following environment variables can be configured:

-- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
 - `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
 - `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)
 - `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
@@ -77,7 +77,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -109,7 +109,7 @@ Make sure you have done `uv pip install llama-stack` and have the Llama Stack CL
 ```bash
 llama stack build --template meta-reference-gpu --image-type conda
 llama stack run distributions/meta-reference-gpu/run.yaml \
-  --port 5001 \
+  --port 8321 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 ```

@@ -117,7 +117,7 @@ If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
 llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
-  --port 5001 \
+  --port 8321 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
   --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 ```
@@ -34,7 +34,7 @@ Note that you need access to nvidia GPUs to run this distribution. This distribu

 The following environment variables can be configured:

-- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
 - `INFERENCE_MODEL`: Inference model loaded into the Meta Reference server (default: `meta-llama/Llama-3.2-3B-Instruct`)
 - `INFERENCE_CHECKPOINT_DIR`: Directory containing the Meta Reference model checkpoint (default: `null`)

@@ -77,7 +77,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -15,7 +15,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov

 The following environment variables can be configured:

-- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
 - `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)

 ### Models
@@ -39,7 +39,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -56,6 +56,6 @@ docker run \
 ```bash
 llama stack build --template nvidia --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port 8321 \
   --env NVIDIA_API_KEY=$NVIDIA_API_KEY
 ```
@@ -32,7 +32,7 @@ You should use this distribution if you have a regular desktop machine without v

 The following environment variables can be configured:

-- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
 - `OLLAMA_URL`: URL of the Ollama server (default: `http://127.0.0.1:11434`)
 - `INFERENCE_MODEL`: Inference model loaded into the Ollama server (default: `meta-llama/Llama-3.2-3B-Instruct`)
 - `SAFETY_MODEL`: Safety model loaded into the Ollama server (default: `meta-llama/Llama-Guard-3-1B`)
@@ -71,7 +71,7 @@ Now you are ready to run Llama Stack with Ollama as the inference provider. You
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-export LLAMA_STACK_PORT=5001
+export LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
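The `docker run` command in this hunk is cut off by the diff view. As a rough sketch of how such an invocation typically continues for the Ollama distribution — the image name and the extra flags are assumptions based on the distribution docs, not part of this diff:

```bash
export LLAMA_STACK_PORT=8321
docker run -it --pull always \
  -p "$LLAMA_STACK_PORT:$LLAMA_STACK_PORT" \
  -v ~/.llama:/root/.llama \
  --env INFERENCE_MODEL="$INFERENCE_MODEL" \
  --env OLLAMA_URL=http://host.docker.internal:11434 \
  llamastack/distribution-ollama \
  --port "$LLAMA_STACK_PORT"
```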
@@ -109,7 +109,7 @@ docker run \
 Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.

 ```bash
-export LLAMA_STACK_PORT=5001
+export LLAMA_STACK_PORT=8321

 llama stack build --template ollama --image-type conda
 llama stack run ./run.yaml \
@@ -30,7 +30,7 @@ The `llamastack/distribution-passthrough` distribution consists of the following

 The following environment variables can be configured:

-- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
 - `PASSTHROUGH_API_KEY`: Passthrough API Key (default: ``)
 - `PASSTHROUGH_URL`: Passthrough URL (default: ``)

@@ -31,7 +31,7 @@ You can use this distribution if you have GPUs and want to run an independent vL

 The following environment variables can be configured:

-- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
 - `INFERENCE_MODEL`: Inference model loaded into the vLLM server (default: `meta-llama/Llama-3.2-3B-Instruct`)
 - `VLLM_URL`: URL of the vLLM server with the main inference model (default: `http://host.docker.internal:5100/v1`)
 - `MAX_TOKENS`: Maximum number of tokens for generation (default: `4096`)
@@ -96,7 +96,7 @@ This method allows you to get started quickly without having to build the distri
 ```bash
 export INFERENCE_PORT=8000
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export LLAMA_STACK_PORT=5001
+export LLAMA_STACK_PORT=8321

 docker run \
   -it \
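The remote-vLLM `docker run` is likewise truncated here. A hedged sketch of how the ports are usually wired together — the image name and the `VLLM_URL` flag are assumptions drawn from the remote-vllm docs rather than from this hunk:

```bash
export INFERENCE_PORT=8000
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export LLAMA_STACK_PORT=8321

docker run -it --pull always \
  -p "$LLAMA_STACK_PORT:$LLAMA_STACK_PORT" \
  --env INFERENCE_MODEL="$INFERENCE_MODEL" \
  --env VLLM_URL="http://host.docker.internal:$INFERENCE_PORT/v1" \
  llamastack/distribution-remote-vllm \
  --port "$LLAMA_STACK_PORT"
```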
@@ -143,7 +143,7 @@ Make sure you have done `uv pip install llama-stack` and have the Llama Stack CL
 ```bash
 export INFERENCE_PORT=8000
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export LLAMA_STACK_PORT=5001
+export LLAMA_STACK_PORT=8321

 cd distributions/remote-vllm
 llama stack build --template remote-vllm --image-type conda
@@ -27,7 +27,7 @@ The `llamastack/distribution-sambanova` distribution consists of the following p

 The following environment variables can be configured:

-- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
 - `SAMBANOVA_API_KEY`: SambaNova.AI API Key (default: ``)

 ### Models
@@ -59,7 +59,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -33,7 +33,7 @@ You can use this distribution if you have GPUs and want to run an independent TG

 The following environment variables can be configured:

-- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
 - `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`)
 - `TGI_URL`: URL of the TGI server with the main inference model (default: `http://127.0.0.1:8080/v1`)
 - `TGI_SAFETY_URL`: URL of the TGI server with the safety model (default: `http://127.0.0.1:8081/v1`)
@@ -92,7 +92,7 @@ Now you are ready to run Llama Stack with TGI as the inference provider. You can
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -30,7 +30,7 @@ The `llamastack/distribution-together` distribution consists of the following pr

 The following environment variables can be configured:

-- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`)
 - `TOGETHER_API_KEY`: Together.AI API Key (default: ``)

 ### Models
@@ -64,7 +64,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -48,7 +48,7 @@
 "outputs": [],
 "source": [
 "HOST = \"localhost\" # Replace with your host\n",
-"PORT = 5001 # Replace with your port\n",
+"PORT = 8321 # Replace with your port\n",
 "MODEL_NAME='meta-llama/Llama-3.2-3B-Instruct'"
 ]
 },
@@ -369,6 +369,9 @@
 }
 ],
 "metadata": {
+"fileHeader": "",
+"fileUid": "7da25939-a2a3-463c-958e-9cdfd710d158",
+"isAdHoc": false,
 "kernelspec": {
 "display_name": "Python 3 (ipykernel)",
 "language": "python",
@@ -386,7 +389,5 @@
 "pygments_lexer": "ipython3",
 "version": "3.10.15"
 }
-},
-"nbformat": 4,
-"nbformat_minor": 5
+}
 }
@@ -43,7 +43,7 @@
 "source": [
 "#### 2. Set Up Local and Cloud Clients\n",
 "\n",
-"Initialize both clients, specifying the `base_url` for each instance. In this case, we have the local distribution running on `http://localhost:8321` and the cloud distribution running on `http://localhost:5001`.\n"
+"Initialize both clients, specifying the `base_url` for each instance. In this case, we have the local distribution running on `http://localhost:8321` and the cloud distribution running on `http://localhost:8322`.\n"
 ]
 },
 {
@@ -236,6 +236,9 @@
 }
 ],
 "metadata": {
+"fileHeader": "",
+"fileUid": "e11939ac-dfbc-4a1c-83be-e494c7f803b8",
+"isAdHoc": false,
 "kernelspec": {
 "display_name": "Python 3 (ipykernel)",
 "language": "python",
@@ -253,7 +256,5 @@
 "pygments_lexer": "ipython3",
 "version": "3.10.15"
 }
-},
-"nbformat": 4,
-"nbformat_minor": 5
+}
 }
@@ -47,7 +47,7 @@
 "outputs": [],
 "source": [
 "HOST = \"localhost\" # Replace with your host\n",
-"PORT = 5001 # Replace with your port\n",
+"PORT = 8321 # Replace with your port\n",
 "MODEL_NAME='meta-llama/Llama-3.2-3B-Instruct'"
 ]
 },
@@ -281,6 +281,9 @@
 }
 ],
 "metadata": {
+"fileHeader": "",
+"fileUid": "b1b93b6e-22a2-4c24-8cb0-161fdafff29a",
+"isAdHoc": false,
 "kernelspec": {
 "display_name": "base",
 "language": "python",
@@ -298,7 +301,5 @@
 "pygments_lexer": "ipython3",
 "version": "3.12.2"
 }
-},
-"nbformat": 4,
-"nbformat_minor": 5
+}
 }
@@ -45,7 +45,7 @@
 "outputs": [],
 "source": [
 "HOST = \"localhost\" # Replace with your host\n",
-"CLOUD_PORT = 5001 # Replace with your cloud distro port\n",
+"CLOUD_PORT = 8321 # Replace with your cloud distro port\n",
 "MODEL_NAME='Llama3.2-11B-Vision-Instruct'"
 ]
 },
@@ -180,6 +180,9 @@
 }
 ],
 "metadata": {
+"fileHeader": "",
+"fileUid": "37bbbfda-8e42-446c-89c7-59dd49e2d339",
+"isAdHoc": false,
 "kernelspec": {
 "display_name": "base",
 "language": "python",
@@ -197,7 +200,5 @@
 "pygments_lexer": "ipython3",
 "version": "3.12.2"
 }
-},
-"nbformat": 4,
-"nbformat_minor": 5
+}
 }
@@ -46,7 +46,7 @@
 "nest_asyncio.apply()\n",
 "\n",
 "HOST = \"localhost\"\n",
-"PORT = 5001\n",
+"PORT = 8321\n",
 "MODEL_NAME = \"meta-llama/Llama-3.2-3B-Instruct\"\n"
 ]
 },
@@ -335,6 +335,9 @@
 }
 ],
 "metadata": {
+"fileHeader": "",
+"fileUid": "f0abbf6d-ed52-40ad-afb4-f5ec99130249",
+"isAdHoc": false,
 "kernelspec": {
 "display_name": "Python 3 (ipykernel)",
 "language": "python",
@@ -352,7 +355,5 @@
 "pygments_lexer": "ipython3",
 "version": "3.10.15"
 }
-},
-"nbformat": 4,
-"nbformat_minor": 5
+}
 }
@@ -45,7 +45,7 @@
 "outputs": [],
 "source": [
 "HOST = \"localhost\" # Replace with your host\n",
-"PORT = 5001 # Replace with your port\n",
+"PORT = 8321 # Replace with your port\n",
 "MODEL_NAME='meta-llama/Llama-3.2-3B-Instruct'\n",
 "MEMORY_BANK_ID=\"tutorial_bank\""
 ]
@@ -378,6 +378,9 @@
 }
 ],
 "metadata": {
+"fileHeader": "",
+"fileUid": "73bc3357-0e5e-42ff-95b1-40b916d24c4f",
+"isAdHoc": false,
 "kernelspec": {
 "display_name": "Python 3 (ipykernel)",
 "language": "python",
@@ -395,7 +398,5 @@
 "pygments_lexer": "ipython3",
 "version": "3.10.15"
 }
-},
-"nbformat": 4,
-"nbformat_minor": 4
+}
 }
@@ -49,7 +49,7 @@
 "outputs": [],
 "source": [
 "HOST = \"localhost\" # Replace with your host\n",
-"PORT = 5001 # Replace with your port\n",
+"PORT = 8321 # Replace with your port\n",
 "SHEILD_NAME=\"meta-llama/Llama-Guard-3-1B\""
 ]
 },
@@ -112,6 +112,9 @@
 }
 ],
 "metadata": {
+"fileHeader": "",
+"fileUid": "9afaddb7-c2fb-4309-8fa0-761697de53f0",
+"isAdHoc": false,
 "kernelspec": {
 "display_name": "Python 3 (ipykernel)",
 "language": "python",
@@ -129,7 +132,5 @@
 "pygments_lexer": "ipython3",
 "version": "3.11.10"
 }
-},
-"nbformat": 4,
-"nbformat_minor": 4
+}
 }
@@ -50,7 +50,7 @@
 "outputs": [],
 "source": [
 "HOST = \"localhost\" # Replace with your host\n",
-"PORT = 5001 # Replace with your port\n",
+"PORT = 8321 # Replace with your port\n",
 "MODEL_NAME = \"meta-llama/Llama-3.2-3B-Instruct\"\n"
 ]
 },
@@ -168,6 +168,9 @@
 }
 ],
 "metadata": {
+"fileHeader": "",
+"fileUid": "8de24775-c4a0-49c7-904e-608264f69292",
+"isAdHoc": false,
 "kernelspec": {
 "display_name": "Python 3 (ipykernel)",
 "language": "python",
@@ -185,7 +188,5 @@
 "pygments_lexer": "ipython3",
 "version": "3.10.15"
 }
-},
-"nbformat": 4,
-"nbformat_minor": 4
+}
 }
@@ -96,7 +96,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
 3. **Set the ENV variables by exporting them to the terminal**:
    ```bash
    export OLLAMA_URL="http://localhost:11434"
-   export LLAMA_STACK_PORT=5001
+   export LLAMA_STACK_PORT=8321
    export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
    export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B"
    ```
@@ -112,7 +112,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next
 ```
 Note: Every time you run a new model with `ollama run`, you will need to restart the llama stack. Otherwise it won't see the new model.

-The server will start and listen on `http://localhost:5001`.
+The server will start and listen on `http://localhost:8321`.

 ---
 ## Test with `llama-stack-client` CLI
@@ -120,11 +120,11 @@ After setting up the server, open a new terminal window and configure the llama-

 1. Configure the CLI to point to the llama-stack server.
    ```bash
-   llama-stack-client configure --endpoint http://localhost:5001
+   llama-stack-client configure --endpoint http://localhost:8321
    ```
    **Expected Output:**
    ```bash
-   Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:5001
+   Done! You can now use the Llama Stack Client CLI with endpoint http://localhost:8321
    ```
 2. Test the CLI by running inference:
    ```bash
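Before running inference, it can help to confirm the server really is listening on the new port. The health route below is an assumption about the server API (it is not part of this diff), so adjust it to whatever your Llama Stack version exposes:

```bash
# Hypothetical smoke test against the new default port.
curl -sf http://localhost:8321/v1/health && echo "Llama Stack is reachable on 8321"
```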
@@ -218,7 +218,7 @@ if INFERENCE_MODEL is None:
     raise ValueError("The environment variable 'INFERENCE_MODEL' is not set.")

 # Initialize the clien
-client = LlamaStackClient(base_url="http://localhost:5001")
+client = LlamaStackClient(base_url="http://localhost:8321")

 # Create a chat completion reques
 response = client.inference.chat_completion(
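For completeness, a hedged sketch of running a script built around the snippet above against the new default port; the filename is hypothetical and the model value is just an example:

```bash
# The snippet above raises ValueError unless INFERENCE_MODEL is set.
export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
python inference_client.py   # hypothetical filename for the snippet above
```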
@@ -9,7 +9,11 @@ from pathlib import Path
 from llama_stack.distribution.datatypes import Provider, ToolGroupInput
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.remote.inference.bedrock.models import MODEL_ENTRIES
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
+from llama_stack.templates.template import (
+    DistributionTemplate,
+    RunConfigSettings,
+    get_model_registry,
+)


 def get_distribution_template() -> DistributionTemplate:
@@ -76,7 +80,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
         },
@@ -47,7 +47,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -14,7 +14,11 @@ from llama_stack.providers.inline.inference.sentence_transformers import (
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.remote.inference.cerebras import CerebrasImplConfig
 from llama_stack.providers.remote.inference.cerebras.models import MODEL_ENTRIES
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
+from llama_stack.templates.template import (
+    DistributionTemplate,
+    RunConfigSettings,
+    get_model_registry,
+)


 def get_distribution_template() -> DistributionTemplate:
@@ -100,7 +104,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "CEREBRAS_API_KEY": (
@@ -39,7 +39,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -56,6 +56,6 @@ docker run \
 ```bash
 llama stack build --template cerebras --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port 8321 \
   --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY
 ```
@@ -15,10 +15,16 @@ from llama_stack.distribution.datatypes import (
 from llama_stack.providers.inline.inference.sentence_transformers import (
     SentenceTransformersInferenceConfig,
 )
-from llama_stack.providers.inline.vector_io.sqlite_vec.config import SQLiteVectorIOConfig
+from llama_stack.providers.inline.vector_io.sqlite_vec.config import (
+    SQLiteVectorIOConfig,
+)
 from llama_stack.providers.remote.inference.fireworks.config import FireworksImplConfig
 from llama_stack.providers.remote.inference.fireworks.models import MODEL_ENTRIES
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
+from llama_stack.templates.template import (
+    DistributionTemplate,
+    RunConfigSettings,
+    get_model_registry,
+)


 def get_distribution_template() -> DistributionTemplate:
@@ -104,7 +110,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "FIREWORKS_API_KEY": (
@@ -16,20 +16,38 @@ from llama_stack.distribution.datatypes import (
 from llama_stack.providers.inline.inference.sentence_transformers import (
     SentenceTransformersInferenceConfig,
 )
-from llama_stack.providers.inline.vector_io.sqlite_vec.config import SQLiteVectorIOConfig
+from llama_stack.providers.inline.vector_io.sqlite_vec.config import (
+    SQLiteVectorIOConfig,
+)
 from llama_stack.providers.remote.inference.anthropic.config import AnthropicConfig
-from llama_stack.providers.remote.inference.anthropic.models import MODEL_ENTRIES as ANTHROPIC_MODEL_ENTRIES
+from llama_stack.providers.remote.inference.anthropic.models import (
+    MODEL_ENTRIES as ANTHROPIC_MODEL_ENTRIES,
+)
 from llama_stack.providers.remote.inference.fireworks.config import FireworksImplConfig
-from llama_stack.providers.remote.inference.fireworks.models import MODEL_ENTRIES as FIREWORKS_MODEL_ENTRIES
+from llama_stack.providers.remote.inference.fireworks.models import (
+    MODEL_ENTRIES as FIREWORKS_MODEL_ENTRIES,
+)
 from llama_stack.providers.remote.inference.gemini.config import GeminiConfig
-from llama_stack.providers.remote.inference.gemini.models import MODEL_ENTRIES as GEMINI_MODEL_ENTRIES
+from llama_stack.providers.remote.inference.gemini.models import (
+    MODEL_ENTRIES as GEMINI_MODEL_ENTRIES,
+)
 from llama_stack.providers.remote.inference.groq.config import GroqConfig
-from llama_stack.providers.remote.inference.groq.models import MODEL_ENTRIES as GROQ_MODEL_ENTRIES
+from llama_stack.providers.remote.inference.groq.models import (
+    MODEL_ENTRIES as GROQ_MODEL_ENTRIES,
+)
 from llama_stack.providers.remote.inference.openai.config import OpenAIConfig
-from llama_stack.providers.remote.inference.openai.models import MODEL_ENTRIES as OPENAI_MODEL_ENTRIES
+from llama_stack.providers.remote.inference.openai.models import (
+    MODEL_ENTRIES as OPENAI_MODEL_ENTRIES,
+)
 from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
-from llama_stack.providers.remote.vector_io.pgvector.config import PGVectorVectorIOConfig
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
+from llama_stack.providers.remote.vector_io.pgvector.config import (
+    PGVectorVectorIOConfig,
+)
+from llama_stack.templates.template import (
+    DistributionTemplate,
+    RunConfigSettings,
+    get_model_registry,
+)


 def get_inference_providers() -> Tuple[List[Provider], List[ModelInput]]:
@@ -168,7 +186,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "FIREWORKS_API_KEY": (
@@ -49,7 +49,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -19,7 +19,11 @@ from llama_stack.providers.inline.inference.sentence_transformers import (
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.remote.inference.fireworks.config import FireworksImplConfig
 from llama_stack.providers.remote.inference.fireworks.models import MODEL_ENTRIES
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
+from llama_stack.templates.template import (
+    DistributionTemplate,
+    RunConfigSettings,
+    get_model_registry,
+)


 def get_distribution_template() -> DistributionTemplate:
@@ -158,7 +162,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "FIREWORKS_API_KEY": (
@@ -49,7 +49,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -7,17 +7,17 @@
 from pathlib import Path

 from llama_stack.apis.models.models import ModelType
-from llama_stack.distribution.datatypes import (
-    ModelInput,
-    Provider,
-    ToolGroupInput,
-)
+from llama_stack.distribution.datatypes import ModelInput, Provider, ToolGroupInput
 from llama_stack.providers.inline.inference.sentence_transformers import (
     SentenceTransformersInferenceConfig,
 )
 from llama_stack.providers.remote.inference.groq import GroqConfig
 from llama_stack.providers.remote.inference.groq.models import MODEL_ENTRIES
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
+from llama_stack.templates.template import (
+    DistributionTemplate,
+    RunConfigSettings,
+    get_model_registry,
+)


 def get_distribution_template() -> DistributionTemplate:
@@ -97,7 +97,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMASTACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "GROQ_API_KEY": (
@@ -127,7 +127,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "HF_API_TOKEN": (
@@ -128,7 +128,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "HF_API_TOKEN": (
@@ -65,7 +65,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -97,7 +97,7 @@ Make sure you have done `uv pip install llama-stack` and have the Llama Stack CL
 ```bash
 llama stack build --template {{ name }} --image-type conda
 llama stack run distributions/{{ name }}/run.yaml \
-  --port 5001 \
+  --port 8321 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 ```

@@ -105,7 +105,7 @@ If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
 llama stack run distributions/{{ name }}/run-with-safety.yaml \
-  --port 5001 \
+  --port 8321 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
   --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 ```
@@ -134,7 +134,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "INFERENCE_MODEL": (
@@ -67,7 +67,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -100,7 +100,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "INFERENCE_MODEL": (
@@ -39,7 +39,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -56,7 +56,7 @@ docker run \
 ```bash
 llama stack build --template nvidia --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port 8321 \
   --env NVIDIA_API_KEY=$NVIDIA_API_KEY
   --env INFERENCE_MODEL=$INFERENCE_MODEL
 ```
@@ -60,7 +60,7 @@ Now you are ready to run Llama Stack with Ollama as the inference provider. You
 This method allows you to get started quickly without having to build the distribution code.

 ```bash
-export LLAMA_STACK_PORT=5001
+export LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -98,7 +98,7 @@ docker run \
 Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available.

 ```bash
-export LLAMA_STACK_PORT=5001
+export LLAMA_STACK_PORT=8321

 llama stack build --template {{ name }} --image-type conda
 llama stack run ./run.yaml \
@@ -138,7 +138,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "OLLAMA_URL": (
@@ -279,7 +279,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "TOGETHER_API_KEY": (
@ -21,10 +21,7 @@ from llama_stack.providers.remote.inference.passthrough.config import (
|
||||||
PassthroughImplConfig,
|
PassthroughImplConfig,
|
||||||
)
|
)
|
||||||
from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
|
from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
|
||||||
from llama_stack.templates.template import (
|
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
|
||||||
DistributionTemplate,
|
|
||||||
RunConfigSettings,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_distribution_template() -> DistributionTemplate:
|
def get_distribution_template() -> DistributionTemplate:
|
||||||
|
@ -186,7 +183,7 @@ def get_distribution_template() -> DistributionTemplate:
|
||||||
},
|
},
|
||||||
run_config_env_vars={
|
run_config_env_vars={
|
||||||
"LLAMA_STACK_PORT": (
|
"LLAMA_STACK_PORT": (
|
||||||
"5001",
|
"8321",
|
||||||
"Port for the Llama Stack distribution server",
|
"Port for the Llama Stack distribution server",
|
||||||
),
|
),
|
||||||
"PASSTHROUGH_API_KEY": (
|
"PASSTHROUGH_API_KEY": (
|
||||||
@@ -83,7 +83,7 @@ This method allows you to get started quickly without having to build the distri
 ```bash
 export INFERENCE_PORT=8000
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export LLAMA_STACK_PORT=5001
+export LLAMA_STACK_PORT=8321
 
 docker run \
   -it \
@@ -130,7 +130,7 @@ Make sure you have done `uv pip install llama-stack` and have the Llama Stack CL
 ```bash
 export INFERENCE_PORT=8000
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export LLAMA_STACK_PORT=5001
+export LLAMA_STACK_PORT=8321
 
 cd distributions/remote-vllm
 llama stack build --template remote-vllm --image-type conda
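The conda instructions above stop at `llama stack build`. A hedged sketch of the follow-up `llama stack run` call with the new port; the `VLLM_URL` variable and the `http://localhost:$INFERENCE_PORT/v1` endpoint are assumptions about the vLLM server started earlier, not content from this hunk.

```bash
# Hypothetical continuation of the steps above; only the port value and the
# exported variables come from the diff, the rest is illustrative.
llama stack run ./run.yaml \
  --port "$LLAMA_STACK_PORT" \
  --env INFERENCE_MODEL="$INFERENCE_MODEL" \
  --env VLLM_URL="http://localhost:$INFERENCE_PORT/v1"
```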
@@ -135,7 +135,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "INFERENCE_MODEL": (
@@ -49,7 +49,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.
 
 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -6,17 +6,19 @@
 
 from pathlib import Path
 
-from llama_stack.distribution.datatypes import (
-    Provider,
-    ShieldInput,
-    ToolGroupInput,
-)
+from llama_stack.distribution.datatypes import Provider, ShieldInput, ToolGroupInput
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.remote.inference.sambanova import SambaNovaImplConfig
 from llama_stack.providers.remote.inference.sambanova.models import MODEL_ENTRIES
 from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig
-from llama_stack.providers.remote.vector_io.pgvector.config import PGVectorVectorIOConfig
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
+from llama_stack.providers.remote.vector_io.pgvector.config import (
+    PGVectorVectorIOConfig,
+)
+from llama_stack.templates.template import (
+    DistributionTemplate,
+    RunConfigSettings,
+    get_model_registry,
+)
 
 
 def get_distribution_template() -> DistributionTemplate:
@@ -105,7 +107,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMASTACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "SAMBANOVA_API_KEY": (
@@ -80,7 +80,7 @@ Now you are ready to run Llama Stack with TGI as the inference provider. You can
 This method allows you to get started quickly without having to build the distribution code.
 
 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -129,7 +129,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "INFERENCE_MODEL": (
@@ -49,7 +49,7 @@ You can do this via Conda (build code) or Docker which has a pre-built image.
 This method allows you to get started quickly without having to build the distribution code.
 
 ```bash
-LLAMA_STACK_PORT=5001
+LLAMA_STACK_PORT=8321
 docker run \
   -it \
   --pull always \
@@ -19,7 +19,11 @@ from llama_stack.providers.inline.inference.sentence_transformers import (
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.remote.inference.together import TogetherImplConfig
 from llama_stack.providers.remote.inference.together.models import MODEL_ENTRIES
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
+from llama_stack.templates.template import (
+    DistributionTemplate,
+    RunConfigSettings,
+    get_model_registry,
+)
 
 
 def get_distribution_template() -> DistributionTemplate:
@@ -154,7 +158,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "TOGETHER_API_KEY": (
@@ -100,7 +100,7 @@ def get_distribution_template() -> DistributionTemplate:
         },
         run_config_env_vars={
             "LLAMA_STACK_PORT": (
-                "5001",
+                "8321",
                 "Port for the Llama Stack distribution server",
             ),
             "INFERENCE_MODEL": (