Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-31 16:01:46 +00:00)
Update Fireworks + Together documentation
This commit is contained in:
parent 1ecaf2cb3c
commit a562668dcd
27 changed files with 879 additions and 445 deletions
|
@ -1,50 +1,91 @@
|
|||
version: '2'
|
||||
image_name: local
|
||||
image_name: fireworks
|
||||
docker_image: null
|
||||
conda_env: local
|
||||
conda_env: null
|
||||
apis:
|
||||
- shields
|
||||
- agents
|
||||
- models
|
||||
- memory
|
||||
- memory_banks
|
||||
- inference
|
||||
- memory
|
||||
- safety
|
||||
- telemetry
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: fireworks0
|
||||
- provider_id: fireworks
|
||||
provider_type: remote::fireworks
|
||||
config:
|
||||
url: https://api.fireworks.ai/inference
|
||||
# api_key: <ENTER_YOUR_API_KEY>
|
||||
safety:
|
||||
safety:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
model: Llama-Guard-3-1B
|
||||
excluded_categories: []
|
||||
- provider_id: meta1
|
||||
provider_type: inline::prompt-guard
|
||||
config:
|
||||
model: Prompt-Guard-86M
|
||||
api_key: ${env.FIREWORKS_API_KEY}
|
||||
memory:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::meta-reference
|
||||
- provider_id: faiss
|
||||
provider_type: inline::faiss
|
||||
config:
|
||||
kvstore:
|
||||
type: sqlite
|
||||
namespace: null
|
||||
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/faiss_store.db
|
||||
safety:
|
||||
- provider_id: llama-guard
|
||||
provider_type: inline::llama-guard
|
||||
config: {}
|
||||
# Uncomment to use weaviate memory provider
|
||||
# - provider_id: weaviate0
|
||||
# provider_type: remote::weaviate
|
||||
# config: {}
|
||||
agents:
|
||||
- provider_id: meta0
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ~/.llama/runtime/kvstore.db
|
||||
namespace: null
|
||||
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/agents_store.db
|
||||
telemetry:
|
||||
- provider_id: meta0
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
||||
metadata_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/registry.db
|
||||
models:
|
||||
- metadata: {}
|
||||
model_id: fireworks/llama-v3p1-8b-instruct
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: fireworks/llama-v3p1-70b-instruct
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: fireworks/llama-v3p1-405b-instruct
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: fireworks/llama-v3p2-1b-instruct
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: fireworks/llama-v3p2-3b-instruct
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: fireworks/llama-v3p2-11b-vision-instruct
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: fireworks/llama-v3p2-90b-vision-instruct
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: fireworks/llama-guard-3-8b
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: fireworks/llama-guard-3-11b-vision
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
shields:
|
||||
- params: null
|
||||
shield_id: meta-llama/Llama-Guard-3-8B
|
||||
provider_id: null
|
||||
provider_shield_id: null
|
||||
memory_banks: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
eval_tasks: []
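
A note on the `${env.VAR}` / `${env.VAR:default}` placeholders used throughout this run configuration (for example `SQLITE_STORE_DIR` and `FIREWORKS_API_KEY` above): Llama Stack substitutes these from the process environment at startup. The snippet below is only an illustrative resolver to show the intended behavior; it is not the stack's actual implementation, and the function name is made up for this sketch.

```python
import os
import re

# Illustrative only: expands ${env.VAR} and ${env.VAR:default} placeholders the
# way the run.yaml above expects. Llama Stack does this internally; the real
# implementation may differ in details (error handling, nesting, etc.).
_PLACEHOLDER = re.compile(r"\$\{env\.(?P<var>[A-Za-z_][A-Za-z0-9_]*)(?::(?P<default>[^}]*))?\}")

def resolve_env_placeholders(value: str) -> str:
    def _sub(match: re.Match) -> str:
        var, default = match.group("var"), match.group("default")
        resolved = os.environ.get(var, default)
        if resolved is None:
            raise ValueError(f"environment variable {var} is not set and has no default")
        return resolved

    return _PLACEHOLDER.sub(_sub, value)

# Without SQLITE_STORE_DIR set, this prints ~/.llama/distributions/fireworks/faiss_store.db
print(resolve_env_placeholders(
    "${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/faiss_store.db"
))
```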
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
version: '2'
|
||||
image_name: remote-vllm
|
||||
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
|
||||
docker_image: null
|
||||
conda_env: null
|
||||
apis:
|
||||
- agents
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
version: '2'
|
||||
image_name: remote-vllm
|
||||
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
|
||||
docker_image: null
|
||||
conda_env: null
|
||||
apis:
|
||||
- agents
|
||||
|
|
|
@ -1,45 +1,87 @@
|
|||
version: '2'
|
||||
image_name: local
|
||||
image_name: together
|
||||
docker_image: null
|
||||
conda_env: local
|
||||
conda_env: null
|
||||
apis:
|
||||
- shields
|
||||
- agents
|
||||
- models
|
||||
- memory
|
||||
- memory_banks
|
||||
- inference
|
||||
- memory
|
||||
- safety
|
||||
- telemetry
|
||||
providers:
|
||||
inference:
|
||||
- provider_id: together0
|
||||
- provider_id: together
|
||||
provider_type: remote::together
|
||||
config:
|
||||
url: https://api.together.xyz/v1
|
||||
# api_key: <ENTER_YOUR_API_KEY>
|
||||
safety:
|
||||
- provider_id: meta0
|
||||
provider_type: inline::llama-guard
|
||||
config:
|
||||
model: Llama-Guard-3-1B
|
||||
excluded_categories: []
|
||||
- provider_id: meta1
|
||||
provider_type: inline::prompt-guard
|
||||
config:
|
||||
model: Prompt-Guard-86M
|
||||
api_key: ${env.TOGETHER_API_KEY}
|
||||
memory:
|
||||
- provider_id: meta0
|
||||
provider_type: remote::weaviate
|
||||
- provider_id: faiss
|
||||
provider_type: inline::faiss
|
||||
config:
|
||||
kvstore:
|
||||
type: sqlite
|
||||
namespace: null
|
||||
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/faiss_store.db
|
||||
safety:
|
||||
- provider_id: llama-guard
|
||||
provider_type: inline::llama-guard
|
||||
config: {}
|
||||
agents:
|
||||
- provider_id: meta0
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config:
|
||||
persistence_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ~/.llama/runtime/kvstore.db
|
||||
namespace: null
|
||||
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/agents_store.db
|
||||
telemetry:
|
||||
- provider_id: meta0
|
||||
- provider_id: meta-reference
|
||||
provider_type: inline::meta-reference
|
||||
config: {}
|
||||
metadata_store:
|
||||
namespace: null
|
||||
type: sqlite
|
||||
db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/registry.db
|
||||
models:
|
||||
- metadata: {}
|
||||
model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: meta-llama/Meta-Llama-Guard-3-8B
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
- metadata: {}
|
||||
model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo
|
||||
provider_id: null
|
||||
provider_model_id: null
|
||||
shields:
|
||||
- params: null
|
||||
shield_id: meta-llama/Llama-Guard-3-1B
|
||||
provider_id: null
|
||||
provider_shield_id: null
|
||||
memory_banks: []
|
||||
datasets: []
|
||||
scoring_fns: []
|
||||
eval_tasks: []
|
||||
|
|
|
@@ -2,63 +2,67 @@

The `llamastack/distribution-fireworks` distribution consists of the following provider configurations.

| API | Provider(s) |
|-----|-------------|
| agents | `inline::meta-reference` |
| inference | `remote::fireworks` |
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
| safety | `inline::llama-guard` |
| telemetry | `inline::meta-reference` |

| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | remote::fireworks | meta-reference | meta-reference | meta-reference | meta-reference |

### Step 0. Prerequisite
- Make sure you have access to a Fireworks API Key. You can get one by visiting [fireworks.ai](https://fireworks.ai/)

### Environment Variables

### Step 1. Start the Distribution (Single Node CPU)

The following environment variables can be configured:

#### (Option 1) Start Distribution Via Docker
> [!NOTE]
> This assumes you have a hosted endpoint at Fireworks with an API Key.

- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
- `FIREWORKS_API_KEY`: Fireworks.AI API Key (default: ``)

```
$ cd distributions/fireworks && docker compose up

### Models

The following models are available by default:

- `fireworks/llama-v3p1-8b-instruct`
- `fireworks/llama-v3p1-70b-instruct`
- `fireworks/llama-v3p1-405b-instruct`
- `fireworks/llama-v3p2-1b-instruct`
- `fireworks/llama-v3p2-3b-instruct`
- `fireworks/llama-v3p2-11b-vision-instruct`
- `fireworks/llama-v3p2-90b-vision-instruct`
- `fireworks/llama-guard-3-8b`
- `fireworks/llama-guard-3-11b-vision`

### Prerequisite: API Keys

Make sure you have access to a Fireworks API Key. You can get one by visiting [fireworks.ai](https://fireworks.ai/).

## Running Llama Stack with Fireworks

You can do this via Conda (build code) or Docker which has a pre-built image.

### Via Docker

This method allows you to get started quickly without having to build the distribution code.

```bash
LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ./run.yaml:/root/my-run.yaml \
  llamastack/distribution-fireworks \
  /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```

Make sure that in your `run.yaml` file, your inference provider is pointing to the correct Fireworks server endpoint URL. E.g.
```
inference:
  - provider_id: fireworks
    provider_type: remote::fireworks
    config:
      url: https://api.fireworks.ai/inference
      api_key: <optional api key>
```
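
Once the stack is running (e.g. via the Docker command above), a quick way to confirm that it can reach Fireworks is to list the registered models from Python. This is a minimal sketch that assumes the `llama-stack-client` package (whose CLI, `llama-stack-client models list`, appears later in this document) exposes `LlamaStackClient` with a `models.list()` call; adjust to the client version you actually have installed.

```python
from llama_stack_client import LlamaStackClient  # pip install llama-stack-client (assumed)

# Point the client at the locally running distribution (LLAMA_STACK_PORT above).
client = LlamaStackClient(base_url="http://localhost:5001")

# The identifiers should match the `models:` section of run.yaml,
# e.g. fireworks/llama-v3p1-8b-instruct.
for model in client.models.list():
    print(model.identifier)
```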
#### (Option 2) Start Distribution Via Conda
### Via Conda

```bash
llama stack build --template fireworks --image-type conda
# -- modify run.yaml to a valid Fireworks server endpoint
llama stack run ./run.yaml
```

### (Optional) Model Serving

Use `llama-stack-client models list` to check the available models served by Fireworks.
```
$ llama-stack-client models list
+------------------------------+------------------------------+---------------+------------+
| identifier                   | llama_model                  | provider_id   | metadata   |
+==============================+==============================+===============+============+
| Llama3.1-8B-Instruct         | Llama3.1-8B-Instruct         | fireworks0    | {}         |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-70B-Instruct        | Llama3.1-70B-Instruct        | fireworks0    | {}         |
+------------------------------+------------------------------+---------------+------------+
| Llama3.1-405B-Instruct       | Llama3.1-405B-Instruct       | fireworks0    | {}         |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-1B-Instruct         | Llama3.2-1B-Instruct         | fireworks0    | {}         |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-3B-Instruct         | Llama3.2-3B-Instruct         | fireworks0    | {}         |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | fireworks0    | {}         |
+------------------------------+------------------------------+---------------+------------+
| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | fireworks0    | {}         |
+------------------------------+------------------------------+---------------+------------+
llama stack run ./run.yaml \
  --port 5001 \
  --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
```
@ -11,90 +11,97 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
|
|||
| telemetry | `inline::meta-reference` |
|
||||
|
||||
|
||||
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
### Models
|
||||
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
|
||||
|
||||
The following models are configured by default:
|
||||
- `${env.INFERENCE_MODEL}`
|
||||
- `${env.SAFETY_MODEL}`
|
||||
## Setting up Ollama server
|
||||
|
||||
## Using Docker Compose
|
||||
Please check the [Ollama Documentation](https://github.com/ollama/ollama) on how to install and run Ollama. After installing Ollama, you need to run `ollama serve` to start the server.
|
||||
|
||||
You can use `docker compose` to start an Ollama server and connect with the Llama Stack server in a single command.
|
||||
In order to load models, you can run:
|
||||
|
||||
```bash
|
||||
$ cd distributions/ollama; docker compose up
|
||||
export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
|
||||
|
||||
# ollama names this model differently, and we must use the ollama name when loading the model
|
||||
export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16"
|
||||
ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m
|
||||
```
|
||||
|
||||
You will see outputs similar to following ---
|
||||
If you are using Llama Stack Safety / Shield APIs, you will also need to pull and run the safety model.
|
||||
|
||||
```bash
|
||||
[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps"
|
||||
[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps"
|
||||
INFO: Started server process [1]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: Application startup complete.
|
||||
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
|
||||
[llamastack] | Resolved 12 providers
|
||||
[llamastack] | inner-inference => ollama0
|
||||
[llamastack] | models => __routing_table__
|
||||
[llamastack] | inference => __autorouted__
|
||||
export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B"
|
||||
|
||||
# ollama names this model differently, and we must use the ollama name when loading the model
|
||||
export OLLAMA_SAFETY_MODEL="llama-guard3:1b"
|
||||
ollama run $OLLAMA_SAFETY_MODEL --keepalive 60m
|
||||
```
|
||||
|
||||
To kill the server
|
||||
## Running Llama Stack
|
||||
|
||||
Now you are ready to run Llama Stack with Ollama as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
docker compose down
|
||||
LLAMA_STACK_PORT=5001
|
||||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
--gpus=all \
|
||||
llamastack/distribution-ollama \
|
||||
/root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env OLLAMA_URL=http://host.docker.internal:11434
|
||||
```
|
||||
|
||||
## Starting Ollama and Llama Stack separately
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
||||
If you wish to separately spin up an Ollama server and connect it with Llama Stack, use the following commands.
|
||||
|
||||
#### Start Ollama server
|
||||
- Please check the [Ollama Documentation](https://github.com/ollama/ollama) for more details.
|
||||
|
||||
**Via Docker**
|
||||
```bash
|
||||
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
|
||||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
-v ./run-with-safety.yaml:/root/my-run.yaml \
|
||||
--gpus=all \
|
||||
llamastack/distribution-ollama \
|
||||
/root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env SAFETY_MODEL=$SAFETY_MODEL \
|
||||
--env OLLAMA_URL=http://host.docker.internal:11434
|
||||
```
|
||||
|
||||
**Via CLI**
|
||||
```bash
|
||||
ollama run <model_id>
|
||||
```
|
||||
### Via Conda
|
||||
|
||||
#### Start Llama Stack server pointing to Ollama server
|
||||
|
||||
**Via Conda**
|
||||
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
|
||||
|
||||
```bash
|
||||
llama stack build --template ollama --image-type conda
|
||||
llama stack run run.yaml
|
||||
llama stack run ./run.yaml \
|
||||
--port 5001 \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env OLLAMA_URL=http://127.0.0.1:11434
|
||||
```
|
||||
|
||||
**Via Docker**
|
||||
```
|
||||
docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./gpu/run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack/distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml
|
||||
```
|
||||
|
||||
Make sure in your `run.yaml` file, your inference provider is pointing to the correct Ollama endpoint. E.g.
|
||||
```yaml
|
||||
inference:
|
||||
- provider_id: ollama0
|
||||
provider_type: remote::ollama
|
||||
config:
|
||||
    url: http://127.0.0.1:11434
|
||||
```
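
Before starting Llama Stack, it can save a debugging round trip to confirm that an Ollama server is actually listening at that URL. Below is a small stdlib-only check; it uses Ollama's `GET /api/tags` endpoint, which lists locally pulled models.

```python
import json
from urllib.request import urlopen

OLLAMA_URL = "http://127.0.0.1:11434"  # must match the `url` in run.yaml

# GET /api/tags returns the models that have been pulled into this Ollama server.
with urlopen(f"{OLLAMA_URL}/api/tags") as resp:
    tags = json.load(resp)

print([m["name"] for m in tags.get("models", [])])
# Expect to see the tag you pulled, e.g. "llama3.1:8b-instruct-fp16".
```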
|
||||
|
||||
### (Optional) Update Model Serving Configuration
|
||||
|
||||
#### Downloading model via Ollama
|
||||
|
||||
You can use ollama for managing model downloads.
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
||||
```bash
|
||||
ollama pull llama3.1:8b-instruct-fp16
|
||||
ollama pull llama3.1:70b-instruct-fp16
|
||||
llama stack run ./run-with-safety.yaml \
|
||||
--port 5001 \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env SAFETY_MODEL=$SAFETY_MODEL \
|
||||
--env OLLAMA_URL=http://127.0.0.1:11434
|
||||
```
|
||||
|
||||
|
||||
### (Optional) Update Model Serving Configuration
|
||||
|
||||
> [!NOTE]
|
||||
> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models.
|
||||
|
||||
|
|
|
@ -12,77 +12,106 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following
|
|||
|
||||
|
||||
You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
|
||||
### Models
|
||||
|
||||
The following models are configured by default:
|
||||
- `${env.INFERENCE_MODEL}`
|
||||
- `${env.SAFETY_MODEL}`
|
||||
|
||||
## Using Docker Compose
|
||||
|
||||
You can use `docker compose` to start a vLLM container and Llama Stack server container together.
|
||||
```bash
|
||||
$ cd distributions/remote-vllm; docker compose up
|
||||
```
|
||||
## Setting up vLLM server
|
||||
|
||||
You will see outputs similar to following ---
|
||||
```
|
||||
<TO BE FILLED>
|
||||
```
|
||||
|
||||
To kill the server
|
||||
```bash
|
||||
docker compose down
|
||||
```
|
||||
|
||||
## Starting vLLM and Llama Stack separately
|
||||
|
||||
You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
|
||||
|
||||
#### Start vLLM server.
|
||||
Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) to get a vLLM endpoint. Here is a sample script to start a vLLM server locally via Docker:
|
||||
|
||||
```bash
|
||||
docker run --runtime nvidia --gpus all \
|
||||
export INFERENCE_PORT=8000
|
||||
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
|
||||
docker run \
|
||||
--runtime nvidia \
|
||||
--gpus $CUDA_VISIBLE_DEVICES \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
|
||||
-p 8000:8000 \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
-p $INFERENCE_PORT:$INFERENCE_PORT \
|
||||
--ipc=host \
|
||||
vllm/vllm-openai:latest \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct
|
||||
--model $INFERENCE_MODEL \
|
||||
--port $INFERENCE_PORT
|
||||
```
|
||||
|
||||
Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details.
|
||||
If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
|
||||
|
||||
|
||||
#### Start Llama Stack server pointing to your vLLM server
|
||||
|
||||
|
||||
We have provided a template `run.yaml` file in the `distributions/remote-vllm` directory. Please make sure to modify the `inference.provider_id` to point to your vLLM server endpoint. As an example, if your vLLM server is running on `http://127.0.0.1:8000`, your `run.yaml` file should look like the following:
|
||||
```yaml
|
||||
inference:
|
||||
- provider_id: vllm0
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
url: http://127.0.0.1:8000
|
||||
```
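
You can also verify the vLLM endpoint before pointing `run.yaml` at it. The vLLM OpenAI-compatible server exposes `GET /v1/models`; this stdlib-only check simply prints what the server is serving.

```python
import json
from urllib.request import urlopen

VLLM_URL = "http://127.0.0.1:8000"  # must match the `url` in run.yaml

# The OpenAI-compatible API lists the served model(s) at /v1/models.
with urlopen(f"{VLLM_URL}/v1/models") as resp:
    payload = json.load(resp)

print([m["id"] for m in payload["data"]])
# Expect the model passed to `--model`, e.g. meta-llama/Llama-3.2-3B-Instruct.
```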
|
||||
|
||||
**Via Conda**
|
||||
|
||||
If you are using Conda, you can build and run the Llama Stack server with the following commands:
|
||||
```bash
|
||||
cd distributions/remote-vllm
|
||||
llama stack build --template remote-vllm --image-type conda
|
||||
llama stack run run.yaml
|
||||
export SAFETY_PORT=8081
|
||||
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
export CUDA_VISIBLE_DEVICES=1
|
||||
|
||||
docker run \
|
||||
--runtime nvidia \
|
||||
--gpus $CUDA_VISIBLE_DEVICES \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
-p $SAFETY_PORT:$SAFETY_PORT \
|
||||
--ipc=host \
|
||||
vllm/vllm-openai:latest \
|
||||
--model $SAFETY_MODEL \
|
||||
--port $SAFETY_PORT
|
||||
```
|
||||
|
||||
**Via Docker**
|
||||
## Running Llama Stack
|
||||
|
||||
Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
You can use the Llama Stack Docker image to start the server with the following command:
|
||||
```bash
|
||||
docker run --network host -it -p 5000:5000 \
|
||||
-v ~/.llama:/root/.llama \
|
||||
-v ./gpu/run.yaml:/root/llamastack-run-remote-vllm.yaml \
|
||||
--gpus=all \
|
||||
LLAMA_STACK_PORT=5001
|
||||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-remote-vllm \
|
||||
--yaml_config /root/llamastack-run-remote-vllm.yaml
|
||||
/root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
  --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run-with-safety.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-remote-vllm \
|
||||
/root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT \
|
||||
--env SAFETY_MODEL=$SAFETY_MODEL \
|
||||
--env VLLM_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT
|
||||
```
|
||||
|
||||
|
||||
### Via Conda
|
||||
|
||||
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
|
||||
|
||||
```bash
|
||||
llama stack build --template remote-vllm --image-type conda
|
||||
llama stack run ./run.yaml \
|
||||
--port 5001 \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
||||
```bash
|
||||
llama stack run ./run-with-safety.yaml \
|
||||
--port 5001 \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \
|
||||
--env SAFETY_MODEL=$SAFETY_MODEL \
|
||||
--env VLLM_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
|
||||
```
|
||||
|
|
|
@ -29,13 +29,13 @@ The following environment variables can be configured:
|
|||
Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker:
|
||||
|
||||
```bash
|
||||
export TGI_INFERENCE_PORT=8080
|
||||
export INFERENCE_PORT=8080
|
||||
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
|
||||
docker run --rm -it \
|
||||
-v $HOME/.cache/huggingface:/data \
|
||||
-p $TGI_INFERENCE_PORT:$TGI_INFERENCE_PORT \
|
||||
-p $INFERENCE_PORT:$INFERENCE_PORT \
|
||||
--gpus $CUDA_VISIBLE_DEVICES \
|
||||
ghcr.io/huggingface/text-generation-inference:2.3.1 \
|
||||
--dtype bfloat16 \
|
||||
|
@ -43,29 +43,29 @@ docker run --rm -it \
|
|||
--sharded false \
|
||||
--cuda-memory-fraction 0.7 \
|
||||
--model-id $INFERENCE_MODEL \
|
||||
--port $TGI_INFERENCE_PORT
|
||||
--port $INFERENCE_PORT
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
|
||||
|
||||
```bash
|
||||
export TGI_SAFETY_PORT=8081
|
||||
export SAFETY_PORT=8081
|
||||
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
export CUDA_VISIBLE_DEVICES=1
|
||||
|
||||
docker run --rm -it \
|
||||
-v $HOME/.cache/huggingface:/data \
|
||||
-p $TGI_SAFETY_PORT:$TGI_SAFETY_PORT \
|
||||
-p $SAFETY_PORT:$SAFETY_PORT \
|
||||
--gpus $CUDA_VISIBLE_DEVICES \
|
||||
ghcr.io/huggingface/text-generation-inference:2.3.1 \
|
||||
--dtype bfloat16 \
|
||||
--usage-stats off \
|
||||
--sharded false \
|
||||
--model-id $SAFETY_MODEL \
|
||||
--port $TGI_SAFETY_PORT
|
||||
--port $SAFETY_PORT
|
||||
```
|
||||
|
||||
## Running Llama Stack with TGI as the inference provider
|
||||
## Running Llama Stack
|
||||
|
||||
Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
||||
|
@ -76,7 +76,6 @@ This method allows you to get started quickly without having to build the distri
|
|||
```bash
|
||||
LLAMA_STACK_PORT=5001
|
||||
docker run \
|
||||
--network host \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
|
@ -84,14 +83,13 @@ docker run \
|
|||
/root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT
|
||||
--env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
--network host \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run-with-safety.yaml:/root/my-run.yaml \
|
||||
|
@ -99,9 +97,9 @@ docker run \
|
|||
/root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT \
|
||||
--env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \
|
||||
--env SAFETY_MODEL=$SAFETY_MODEL \
|
||||
--env TGI_SAFETY_URL=http://host.docker.internal:$TGI_SAFETY_PORT
|
||||
--env TGI_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT
|
||||
```
|
||||
|
||||
### Via Conda
|
||||
|
@ -113,7 +111,7 @@ llama stack build --template tgi --image-type conda
|
|||
llama stack run ./run.yaml
|
||||
--port 5001
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
||||
--env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT
|
||||
--env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
@ -122,7 +120,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
|
|||
llama stack run ./run-with-safety.yaml
|
||||
--port 5001
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL
|
||||
--env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT
|
||||
--env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
|
||||
--env SAFETY_MODEL=$SAFETY_MODEL
|
||||
--env TGI_SAFETY_URL=http://127.0.0.1:$TGI_SAFETY_PORT
|
||||
--env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
|
||||
```
|
||||
|
|
|
@ -1,62 +1,67 @@
|
|||
# Together Distribution
|
||||
|
||||
### Connect to a Llama Stack Together Endpoint
|
||||
- You may connect to a hosted endpoint `https://llama-stack.together.ai`, serving a Llama Stack distribution
|
||||
# Fireworks Distribution
|
||||
|
||||
The `llamastack/distribution-together` distribution consists of the following provider configurations.
|
||||
|
||||
|
||||
| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|
||||
|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- |
|
||||
| **Provider(s)** | remote::together | meta-reference | meta-reference, remote::weaviate | meta-reference | meta-reference |
|
||||
| API | Provider(s) |
|
||||
|-----|-------------|
|
||||
| agents | `inline::meta-reference` |
|
||||
| inference | `remote::together` |
|
||||
| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
|
||||
| safety | `inline::llama-guard` |
|
||||
| telemetry | `inline::meta-reference` |
|
||||
|
||||
|
||||
### Docker: Start the Distribution (Single Node CPU)
|
||||
### Environment Variables
|
||||
|
||||
> [!NOTE]
|
||||
> This assumes you have a hosted endpoint at Together with an API Key.
|
||||
The following environment variables can be configured:
|
||||
|
||||
```
|
||||
$ cd distributions/together && docker compose up
|
||||
- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
|
||||
- `TOGETHER_API_KEY`: Together.AI API Key (default: ``)
|
||||
|
||||
### Models
|
||||
|
||||
The following models are available by default:
|
||||
|
||||
- `meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo`
|
||||
- `meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo`
|
||||
- `meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo`
|
||||
- `meta-llama/Llama-3.2-3B-Instruct-Turbo`
|
||||
- `meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo`
|
||||
- `meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo`
|
||||
- `meta-llama/Meta-Llama-Guard-3-8B`
|
||||
- `meta-llama/Llama-Guard-3-11B-Vision-Turbo`
|
||||
|
||||
|
||||
### Prerequisite: API Keys
|
||||
|
||||
Make sure you have access to a Together API Key. You can get one by visiting [together.xyz](https://together.xyz/).
|
||||
|
||||
|
||||
## Running Llama Stack with Together
|
||||
|
||||
You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
LLAMA_STACK_PORT=5001
|
||||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-together \
|
||||
/root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env TOGETHER_API_KEY=$TOGETHER_API_KEY
|
||||
```
|
||||
|
||||
Make sure that in your `run.yaml` file, your inference provider is pointing to the correct Together server endpoint URL. E.g.
|
||||
```
|
||||
inference:
|
||||
- provider_id: together
|
||||
provider_type: remote::together
|
||||
config:
|
||||
url: https://api.together.xyz/v1
|
||||
api_key: <optional api key>
|
||||
```
|
||||
|
||||
### Conda llama stack run (Single Node CPU)
|
||||
### Via Conda
|
||||
|
||||
```bash
|
||||
llama stack build --template together --image-type conda
|
||||
# -- modify run.yaml to a valid Together server endpoint
|
||||
llama stack run ./run.yaml
|
||||
```
|
||||
|
||||
### (Optional) Update Model Serving Configuration
|
||||
|
||||
Use `llama-stack-client models list` to check the available models served by Together.
|
||||
|
||||
```
|
||||
$ llama-stack-client models list
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
| identifier | llama_model | provider_id | metadata |
|
||||
+==============================+==============================+===============+============+
|
||||
| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | together0 | {} |
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | together0 | {} |
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | together0 | {} |
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | together0 | {} |
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0 | {} |
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0 | {} |
|
||||
+------------------------------+------------------------------+---------------+------------+
|
||||
llama stack run ./run.yaml \
|
||||
--port 5001 \
|
||||
--env TOGETHER_API_KEY=$TOGETHER_API_KEY
|
||||
```
|
||||
|
|
|
@@ -4,7 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from typing import Optional
from typing import Any, Dict, Optional

from llama_models.schema_utils import json_schema_type
from pydantic import BaseModel, Field

@@ -20,3 +20,10 @@ class FireworksImplConfig(BaseModel):
        default=None,
        description="The Fireworks.ai API Key",
    )

    @classmethod
    def sample_run_config(cls) -> Dict[str, Any]:
        return {
            "url": "https://api.fireworks.ai/inference",
            "api_key": "${env.FIREWORKS_API_KEY}",
        }
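
The dictionary returned by `sample_run_config()` is what ends up under this provider's `config:` key in the generated `run.yaml` (see the Fireworks run configuration earlier in this diff). The sketch below only illustrates that relationship; the real generation goes through `DistributionTemplate` / `RunConfigSettings`, so treat the surrounding scaffolding here as an assumption.

```python
import yaml  # PyYAML, assumed available

from llama_stack.providers.remote.inference.fireworks import FireworksImplConfig

# Hand-rolled provider entry purely for illustration; the template machinery
# builds the equivalent structure when it emits run.yaml.
provider_entry = {
    "provider_id": "fireworks",
    "provider_type": "remote::fireworks",
    "config": FireworksImplConfig.sample_run_config(),
}

print(yaml.safe_dump({"inference": [provider_entry]}, sort_keys=False))
# inference:
# - provider_id: fireworks
#   provider_type: remote::fireworks
#   config:
#     url: https://api.fireworks.ai/inference
#     api_key: ${env.FIREWORKS_API_KEY}
```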
|
||||
|
|
|
@ -35,7 +35,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
|
|||
from .config import FireworksImplConfig
|
||||
|
||||
|
||||
model_aliases = [
|
||||
MODEL_ALIASES = [
|
||||
build_model_alias(
|
||||
"fireworks/llama-v3p1-8b-instruct",
|
||||
CoreModelId.llama3_1_8b_instruct.value,
|
||||
|
@ -79,7 +79,7 @@ class FireworksInferenceAdapter(
|
|||
ModelRegistryHelper, Inference, NeedsRequestProviderData
|
||||
):
|
||||
def __init__(self, config: FireworksImplConfig) -> None:
|
||||
ModelRegistryHelper.__init__(self, model_aliases)
|
||||
ModelRegistryHelper.__init__(self, MODEL_ALIASES)
|
||||
self.config = config
|
||||
self.formatter = ChatFormat(Tokenizer.get_instance())
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from typing import Optional
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from llama_models.schema_utils import json_schema_type
|
||||
from pydantic import BaseModel, Field
|
||||
|
@ -20,3 +20,10 @@ class TogetherImplConfig(BaseModel):
|
|||
default=None,
|
||||
description="The Together AI API Key",
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def sample_run_config(cls) -> Dict[str, Any]:
|
||||
return {
|
||||
"url": "https://api.together.xyz/v1",
|
||||
"api_key": "${env.TOGETHER_API_KEY}",
|
||||
}
|
||||
|
|
|
@ -38,7 +38,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import (
|
|||
from .config import TogetherImplConfig
|
||||
|
||||
|
||||
model_aliases = [
|
||||
MODEL_ALIASES = [
|
||||
build_model_alias(
|
||||
"meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
|
||||
CoreModelId.llama3_1_8b_instruct.value,
|
||||
|
@ -78,7 +78,7 @@ class TogetherInferenceAdapter(
|
|||
ModelRegistryHelper, Inference, NeedsRequestProviderData
|
||||
):
|
||||
def __init__(self, config: TogetherImplConfig) -> None:
|
||||
ModelRegistryHelper.__init__(self, model_aliases)
|
||||
ModelRegistryHelper.__init__(self, MODEL_ALIASES)
|
||||
self.config = config
|
||||
self.formatter = ChatFormat(Tokenizer.get_instance())
|
||||
|
||||
|
|
llama_stack/templates/fireworks/__init__.py (new file, +7)
|
@ -0,0 +1,7 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from .fireworks import get_distribution_template # noqa: F401
|
|
@ -1,11 +1,19 @@
|
|||
version: '2'
|
||||
name: fireworks
|
||||
distribution_spec:
|
||||
description: Use Fireworks.ai for running LLM inference
|
||||
description: Use Fireworks.AI for running LLM inference
|
||||
docker_image: null
|
||||
providers:
|
||||
inference: remote::fireworks
|
||||
inference:
|
||||
- remote::fireworks
|
||||
memory:
|
||||
- inline::faiss
|
||||
- remote::weaviate
|
||||
safety: inline::llama-guard
|
||||
agents: inline::meta-reference
|
||||
telemetry: inline::meta-reference
|
||||
- remote::chromadb
|
||||
- remote::pgvector
|
||||
safety:
|
||||
- inline::llama-guard
|
||||
agents:
|
||||
- inline::meta-reference
|
||||
telemetry:
|
||||
- inline::meta-reference
|
||||
image_type: conda
|
||||
|
|
llama_stack/templates/fireworks/doc_template.md (new file, +60)
|
@ -0,0 +1,60 @@
|
|||
# Fireworks Distribution
|
||||
|
||||
The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
|
||||
|
||||
{{ providers_table }}
|
||||
|
||||
{% if run_config_env_vars %}
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
{% for var, (default_value, description) in run_config_env_vars.items() %}
|
||||
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
|
||||
{% if default_models %}
|
||||
### Models
|
||||
|
||||
The following models are available by default:
|
||||
|
||||
{% for model in default_models %}
|
||||
- `{{ model.model_id }}`
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
|
||||
|
||||
### Prerequisite: API Keys
|
||||
|
||||
Make sure you have access to a Fireworks API Key. You can get one by visiting [fireworks.ai](https://fireworks.ai/).
|
||||
|
||||
|
||||
## Running Llama Stack with Fireworks
|
||||
|
||||
You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
LLAMA_STACK_PORT=5001
|
||||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-{{ name }} \
|
||||
/root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
|
||||
```
|
||||
|
||||
### Via Conda
|
||||
|
||||
```bash
|
||||
llama stack build --template fireworks --image-type conda
|
||||
llama stack run ./run.yaml \
|
||||
--port 5001 \
|
||||
--env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
|
||||
```
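
The `{{ ... }}` and `{% ... %}` blocks in `doc_template.md` are Jinja2. Below is a rough sketch of how the template can be rendered; in the repository this is driven by `DistributionTemplate` (see `fireworks.py` below), so the context values here are illustrative stand-ins.

```python
from types import SimpleNamespace

from jinja2 import Template

with open("llama_stack/templates/fireworks/doc_template.md") as f:
    doc_template = Template(f.read())

# Context mirrors the variables the template references; the real generator
# assembles these from the provider registry and the run config settings.
rendered = doc_template.render(
    name="fireworks",
    providers_table="| API | Provider(s) |\n|-----|-------------|\n| inference | `remote::fireworks` |",
    run_config_env_vars={
        "LLAMASTACK_PORT": ("5001", "Port for the Llama Stack distribution server"),
        "FIREWORKS_API_KEY": ("", "Fireworks.AI API Key"),
    },
    default_models=[
        SimpleNamespace(model_id="fireworks/llama-v3p1-8b-instruct"),
        SimpleNamespace(model_id="fireworks/llama-guard-3-8b"),
    ],
)
print(rendered)
```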
|
llama_stack/templates/fireworks/fireworks.py (new file, +60)
|
@ -0,0 +1,60 @@
|
|||
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
# All rights reserved.
|
||||
#
|
||||
# This source code is licensed under the terms described in the LICENSE file in
|
||||
# the root directory of this source tree.
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
|
||||
from llama_stack.providers.remote.inference.fireworks import FireworksImplConfig
|
||||
from llama_stack.providers.remote.inference.fireworks.fireworks import MODEL_ALIASES
|
||||
|
||||
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
|
||||
|
||||
|
||||
def get_distribution_template() -> DistributionTemplate:
|
||||
providers = {
|
||||
"inference": ["remote::fireworks"],
|
||||
"memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
|
||||
"safety": ["inline::llama-guard"],
|
||||
"agents": ["inline::meta-reference"],
|
||||
"telemetry": ["inline::meta-reference"],
|
||||
}
|
||||
|
||||
inference_provider = Provider(
|
||||
provider_id="fireworks",
|
||||
provider_type="remote::fireworks",
|
||||
config=FireworksImplConfig.sample_run_config(),
|
||||
)
|
||||
|
||||
default_models = [ModelInput(model_id=m.provider_model_id) for m in MODEL_ALIASES]
|
||||
|
||||
return DistributionTemplate(
|
||||
name="fireworks",
|
||||
distro_type="self_hosted",
|
||||
description="Use Fireworks.AI for running LLM inference",
|
||||
docker_image=None,
|
||||
template_path=Path(__file__).parent / "doc_template.md",
|
||||
providers=providers,
|
||||
default_models=default_models,
|
||||
run_configs={
|
||||
"run.yaml": RunConfigSettings(
|
||||
provider_overrides={
|
||||
"inference": [inference_provider],
|
||||
},
|
||||
default_models=default_models,
|
||||
default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
|
||||
),
|
||||
},
|
||||
run_config_env_vars={
|
||||
"LLAMASTACK_PORT": (
|
||||
"5001",
|
||||
"Port for the Llama Stack distribution server",
|
||||
),
|
||||
"FIREWORKS_API_KEY": (
|
||||
"",
|
||||
"Fireworks.AI API Key",
|
||||
),
|
||||
},
|
||||
)
|
|
@ -6,103 +6,106 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following
|
|||
|
||||
You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
|
||||
|
||||
{%- if docker_compose_env_vars %}
|
||||
{%- if run_config_env_vars %}
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
{% for var, (default_value, description) in docker_compose_env_vars.items() %}
|
||||
{% for var, (default_value, description) in run_config_env_vars.items() %}
|
||||
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
|
||||
{%- if default_models %}
|
||||
### Models
|
||||
|
||||
The following models are configured by default:
|
||||
{% for model in default_models %}
|
||||
- `{{ model.model_id }}`
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
## Setting up Ollama server
|
||||
|
||||
## Using Docker Compose
|
||||
Please check the [Ollama Documentation](https://github.com/ollama/ollama) on how to install and run Ollama. After installing Ollama, you need to run `ollama serve` to start the server.
|
||||
|
||||
You can use `docker compose` to start an Ollama server and connect with the Llama Stack server in a single command.
|
||||
In order to load models, you can run:
|
||||
|
||||
```bash
|
||||
$ cd distributions/{{ name }}; docker compose up
|
||||
export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
|
||||
|
||||
# ollama names this model differently, and we must use the ollama name when loading the model
|
||||
export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16"
|
||||
ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m
|
||||
```
|
||||
|
||||
You will see outputs similar to following ---
|
||||
If you are using Llama Stack Safety / Shield APIs, you will also need to pull and run the safety model.
|
||||
|
||||
```bash
|
||||
[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps"
|
||||
[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps"
|
||||
INFO: Started server process [1]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: Application startup complete.
|
||||
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit)
|
||||
[llamastack] | Resolved 12 providers
|
||||
[llamastack] | inner-inference => ollama0
|
||||
[llamastack] | models => __routing_table__
|
||||
[llamastack] | inference => __autorouted__
|
||||
export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B"
|
||||
|
||||
# ollama names this model differently, and we must use the ollama name when loading the model
|
||||
export OLLAMA_SAFETY_MODEL="llama-guard3:1b"
|
||||
ollama run $OLLAMA_SAFETY_MODEL --keepalive 60m
|
||||
```
|
||||
|
||||
To kill the server
|
||||
## Running Llama Stack
|
||||
|
||||
Now you are ready to run Llama Stack with Ollama as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
docker compose down
|
||||
LLAMA_STACK_PORT=5001
|
||||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
--gpus=all \
|
||||
llamastack/distribution-{{ name }} \
|
||||
/root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env OLLAMA_URL=http://host.docker.internal:11434
|
||||
```
|
||||
|
||||
## Starting Ollama and Llama Stack separately
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
||||
If you wish to separately spin up an Ollama server and connect it with Llama Stack, use the following commands.
|
||||
|
||||
#### Start Ollama server
|
||||
- Please check the [Ollama Documentation](https://github.com/ollama/ollama) for more details.
|
||||
|
||||
**Via Docker**
|
||||
```bash
|
||||
docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama
|
||||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ~/.llama:/root/.llama \
|
||||
-v ./run-with-safety.yaml:/root/my-run.yaml \
|
||||
--gpus=all \
|
||||
llamastack/distribution-{{ name }} \
|
||||
/root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env SAFETY_MODEL=$SAFETY_MODEL \
|
||||
--env OLLAMA_URL=http://host.docker.internal:11434
|
||||
```
|
||||
|
||||
**Via CLI**
|
||||
```bash
|
||||
ollama run <model_id>
|
||||
```
|
||||
### Via Conda
|
||||
|
||||
#### Start Llama Stack server pointing to Ollama server
|
||||
|
||||
**Via Conda**
|
||||
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
|
||||
|
||||
```bash
|
||||
llama stack build --template ollama --image-type conda
|
||||
llama stack run run.yaml
|
||||
llama stack run ./run.yaml \
|
||||
--port 5001 \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env OLLAMA_URL=http://127.0.0.1:11434
|
||||
```
|
||||
|
||||
**Via Docker**
|
||||
```
|
||||
docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./gpu/run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack/distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml
|
||||
```
|
||||
|
||||
Make sure in your `run.yaml` file, your inference provider is pointing to the correct Ollama endpoint. E.g.
|
||||
```yaml
|
||||
inference:
|
||||
- provider_id: ollama0
|
||||
provider_type: remote::ollama
|
||||
config:
|
||||
    url: http://127.0.0.1:11434
|
||||
```
|
||||
|
||||
### (Optional) Update Model Serving Configuration
|
||||
|
||||
#### Downloading model via Ollama
|
||||
|
||||
You can use ollama for managing model downloads.
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
||||
```bash
|
||||
ollama pull llama3.1:8b-instruct-fp16
|
||||
ollama pull llama3.1:70b-instruct-fp16
|
||||
llama stack run ./run-with-safety.yaml \
|
||||
--port 5001 \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env SAFETY_MODEL=$SAFETY_MODEL \
|
||||
--env OLLAMA_URL=http://127.0.0.1:11434
|
||||
```
|
||||
|
||||
|
||||
### (Optional) Update Model Serving Configuration
|
||||
|
||||
> [!NOTE]
|
||||
> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models.
|
||||
|
||||
|
|
|
@ -68,17 +68,17 @@ def get_distribution_template() -> DistributionTemplate:
|
|||
"5001",
|
||||
"Port for the Llama Stack distribution server",
|
||||
),
|
||||
"OLLAMA_URL": (
|
||||
"http://127.0.0.1:11434",
|
||||
"URL of the Ollama server",
|
||||
),
|
||||
"INFERENCE_MODEL": (
|
||||
"meta-llama/Llama-3.2-3B-Instruct",
|
||||
"Inference model loaded into the TGI server",
|
||||
),
|
||||
"OLLAMA_URL": (
|
||||
"http://host.docker.internal:11434",
|
||||
"URL of the Ollama server",
|
||||
"Inference model loaded into the Ollama server",
|
||||
),
|
||||
"SAFETY_MODEL": (
|
||||
"meta-llama/Llama-Guard-3-1B",
|
||||
"Name of the safety (Llama-Guard) model to use",
|
||||
"Safety model loaded into the Ollama server",
|
||||
),
|
||||
},
|
||||
)
|
||||
|
|
|
@ -2,7 +2,7 @@ version: '2'
|
|||
name: remote-vllm
|
||||
distribution_spec:
|
||||
description: Use (an external) vLLM server for running LLM inference
|
||||
docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3
|
||||
docker_image: null
|
||||
providers:
|
||||
inference:
|
||||
- remote::vllm
|
||||
|
|
|
@ -6,90 +6,114 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following
|
|||
|
||||
You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.
|
||||
|
||||
{%- if docker_compose_env_vars %}
|
||||
{% if run_config_env_vars %}
|
||||
### Environment Variables
|
||||
|
||||
The following environment variables can be configured:
|
||||
|
||||
{% for var, (default_value, description) in docker_compose_env_vars.items() %}
|
||||
{% for var, (default_value, description) in run_config_env_vars.items() %}
|
||||
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
|
||||
{% if default_models %}
|
||||
### Models
|
||||
|
||||
The following models are configured by default:
|
||||
{% for model in default_models %}
|
||||
- `{{ model.model_id }}`
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
## Setting up vLLM server
|
||||
|
||||
## Using Docker Compose
|
||||
|
||||
You can use `docker compose` to start a vLLM container and Llama Stack server container together.
|
||||
```bash
|
||||
$ cd distributions/{{ name }}; docker compose up
|
||||
```
|
||||
|
||||
You will see outputs similar to following ---
|
||||
```
|
||||
<TO BE FILLED>
|
||||
```
|
||||
|
||||
To kill the server
|
||||
```bash
|
||||
docker compose down
|
||||
```
|
||||
|
||||
## Starting vLLM and Llama Stack separately
|
||||
|
||||
You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack.
|
||||
|
||||
#### Start vLLM server.
|
||||
Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) to get a vLLM endpoint. Here is a sample script to start a vLLM server locally via Docker:
|
||||
|
||||
```bash
|
||||
docker run --runtime nvidia --gpus all \
|
||||
export INFERENCE_PORT=8000
|
||||
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
|
||||
docker run \
|
||||
--runtime nvidia \
|
||||
--gpus $CUDA_VISIBLE_DEVICES \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \
|
||||
-p 8000:8000 \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
-p $INFERENCE_PORT:$INFERENCE_PORT \
|
||||
--ipc=host \
|
||||
vllm/vllm-openai:latest \
|
||||
--model meta-llama/Llama-3.2-3B-Instruct
|
||||
--model $INFERENCE_MODEL \
|
||||
--port $INFERENCE_PORT
|
||||
```
|
||||
|
||||
Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details.
|
||||
If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
|
||||
|
||||
```bash
|
||||
export SAFETY_PORT=8081
|
||||
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
|
||||
export CUDA_VISIBLE_DEVICES=1
|
||||
|
||||
docker run \
|
||||
--runtime nvidia \
|
||||
--gpus $CUDA_VISIBLE_DEVICES \
|
||||
-v ~/.cache/huggingface:/root/.cache/huggingface \
|
||||
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
|
||||
-p $SAFETY_PORT:$SAFETY_PORT \
|
||||
--ipc=host \
|
||||
vllm/vllm-openai:latest \
|
||||
--model $SAFETY_MODEL \
|
||||
--port $SAFETY_PORT
|
||||
```
|
||||
|
||||
## Running Llama Stack
|
||||
|
||||
Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
|
||||
|
||||
### Via Docker
|
||||
|
||||
This method allows you to get started quickly without having to build the distribution code.
|
||||
|
||||
```bash
|
||||
LLAMA_STACK_PORT=5001
|
||||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-{{ name }} \
|
||||
/root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
  --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT
|
||||
```
|
||||
|
||||
If you are using Llama Stack Safety / Shield APIs, use:
|
||||
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
-p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
|
||||
-v ./run-with-safety.yaml:/root/my-run.yaml \
|
||||
llamastack/distribution-{{ name }} \
|
||||
/root/my-run.yaml \
|
||||
--port $LLAMA_STACK_PORT \
|
||||
--env INFERENCE_MODEL=$INFERENCE_MODEL \
|
||||
--env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT \
|
||||
--env SAFETY_MODEL=$SAFETY_MODEL \
|
||||
--env VLLM_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT
|
||||
```
|
||||
|
||||
|
||||
### Via Conda
|
||||
|
||||
Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
|
||||
|
||||
|
||||
#### Start Llama Stack server pointing to your vLLM server
|
||||
|
||||
|
||||
We have provided a template `run.yaml` file in the `distributions/remote-vllm` directory. Please make sure to modify the `inference.provider_id` to point to your vLLM server endpoint. As an example, if your vLLM server is running on `http://127.0.0.1:8000`, your `run.yaml` file should look like the following:
|
||||
```yaml
|
||||
inference:
|
||||
- provider_id: vllm0
|
||||
provider_type: remote::vllm
|
||||
config:
|
||||
url: http://127.0.0.1:8000
|
||||
```

**Via Conda**

If you are using Conda, you can build and run the Llama Stack server with the following commands:
```bash
cd distributions/remote-vllm
llama stack build --template remote-vllm --image-type conda
llama stack run run.yaml
llama stack run ./run.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT
```

**Via Docker**
If you are using Llama Stack Safety / Shield APIs, use:

You can use the Llama Stack Docker image to start the server with the following command:
```bash
docker run --network host -it -p 5000:5000 \
  -v ~/.llama:/root/.llama \
  -v ./gpu/run.yaml:/root/llamastack-run-remote-vllm.yaml \
  --gpus=all \
  llamastack/distribution-remote-vllm \
  --yaml_config /root/llamastack-run-remote-vllm.yaml
llama stack run ./run-with-safety.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \
  --env SAFETY_MODEL=$SAFETY_MODEL \
  --env VLLM_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
```

@@ -41,7 +41,6 @@ def get_distribution_template() -> DistributionTemplate:
        name="remote-vllm",
        distro_type="self_hosted",
        description="Use (an external) vLLM server for running LLM inference",
        docker_image="llamastack/distribution-remote-vllm:test-0.0.52rc3",
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
        default_models=[inference_model, safety_model],

@@ -22,13 +22,13 @@ The following environment variables can be configured:
Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker:

```bash
export TGI_INFERENCE_PORT=8080
export INFERENCE_PORT=8080
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export CUDA_VISIBLE_DEVICES=0

docker run --rm -it \
  -v $HOME/.cache/huggingface:/data \
  -p $TGI_INFERENCE_PORT:$TGI_INFERENCE_PORT \
  -p $INFERENCE_PORT:$INFERENCE_PORT \
  --gpus $CUDA_VISIBLE_DEVICES \
  ghcr.io/huggingface/text-generation-inference:2.3.1 \
  --dtype bfloat16 \
@@ -36,29 +36,29 @@ docker run --rm -it \
  --sharded false \
  --cuda-memory-fraction 0.7 \
  --model-id $INFERENCE_MODEL \
  --port $TGI_INFERENCE_PORT
  --port $INFERENCE_PORT
```
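
Before pointing Llama Stack at this endpoint, you may want to confirm TGI is up. A small sketch, assuming TGI's standard `/info` and `/generate` routes and the port exported above:

```bash
# Report which model TGI loaded and how it is configured
curl -s http://localhost:$INFERENCE_PORT/info

# Request a short generation to confirm the model responds
curl -s http://localhost:$INFERENCE_PORT/generate \
  -H "Content-Type: application/json" \
  -d '{"inputs": "Hello, my name is", "parameters": {"max_new_tokens": 16}}'
```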

If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:

```bash
export TGI_SAFETY_PORT=8081
export SAFETY_PORT=8081
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export CUDA_VISIBLE_DEVICES=1

docker run --rm -it \
  -v $HOME/.cache/huggingface:/data \
  -p $TGI_SAFETY_PORT:$TGI_SAFETY_PORT \
  -p $SAFETY_PORT:$SAFETY_PORT \
  --gpus $CUDA_VISIBLE_DEVICES \
  ghcr.io/huggingface/text-generation-inference:2.3.1 \
  --dtype bfloat16 \
  --usage-stats off \
  --sharded false \
  --model-id $SAFETY_MODEL \
  --port $TGI_SAFETY_PORT
  --port $SAFETY_PORT
```

## Running Llama Stack with TGI as the inference provider
## Running Llama Stack

Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker, which has a pre-built image.

@@ -69,7 +69,6 @@ This method allows you to get started quickly without having to build the distri
```bash
LLAMA_STACK_PORT=5001
docker run \
  --network host \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ./run.yaml:/root/my-run.yaml \
@@ -77,14 +76,13 @@ docker run \
  /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT
  --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
```

If you are using Llama Stack Safety / Shield APIs, use:

```bash
docker run \
  --network host \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ./run-with-safety.yaml:/root/my-run.yaml \
@@ -92,9 +90,9 @@ docker run \
  /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT \
  --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \
  --env SAFETY_MODEL=$SAFETY_MODEL \
  --env TGI_SAFETY_URL=http://host.docker.internal:$TGI_SAFETY_PORT
  --env TGI_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT
```

### Via Conda

@@ -106,7 +104,7 @@ llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT
  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
```

If you are using Llama Stack Safety / Shield APIs, use:

@@ -115,7 +113,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
llama stack run ./run-with-safety.yaml \
  --port 5001 \
  --env INFERENCE_MODEL=$INFERENCE_MODEL \
  --env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT \
  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \
  --env SAFETY_MODEL=$SAFETY_MODEL \
  --env TGI_SAFETY_URL=http://127.0.0.1:$TGI_SAFETY_PORT
  --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
```

7 llama_stack/templates/together/__init__.py Normal file
@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .together import get_distribution_template  # noqa: F401

@@ -1,11 +1,19 @@
version: '2'
name: together
distribution_spec:
  description: Use Together.ai for running LLM inference
  description: Use Together.AI for running LLM inference
  docker_image: null
  providers:
    inference: remote::together
    inference:
    - remote::together
    memory:
    - inline::faiss
    - remote::weaviate
    safety: inline::llama-guard
    agents: inline::meta-reference
    telemetry: inline::meta-reference
    - remote::chromadb
    - remote::pgvector
    safety:
    - inline::llama-guard
    agents:
    - inline::meta-reference
    telemetry:
    - inline::meta-reference
image_type: conda

60 llama_stack/templates/together/doc_template.md Normal file
@@ -0,0 +1,60 @@
# Together Distribution

The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.

{{ providers_table }}

{% if run_config_env_vars %}
### Environment Variables

The following environment variables can be configured:

{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}

{% if default_models %}
### Models

The following models are available by default:

{% for model in default_models %}
- `{{ model.model_id }}`
{% endfor %}
{% endif %}

### Prerequisite: API Keys

Make sure you have access to a Together API Key. You can get one by visiting [together.xyz](https://together.xyz/).
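
Once you have a key, export it in the shell you will run the commands below from. The key value here is just a placeholder, and the optional check assumes Together's OpenAI-compatible `/v1/models` route:

```bash
# Placeholder value; substitute your real Together API key
export TOGETHER_API_KEY=<your-together-api-key>

# Optional: confirm the key is accepted before starting the stack
curl -s https://api.together.xyz/v1/models \
  -H "Authorization: Bearer $TOGETHER_API_KEY"
```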

## Running Llama Stack with Together

You can do this via Conda (build code) or Docker, which has a pre-built image.

### Via Docker

This method allows you to get started quickly without having to build the distribution code.

```bash
LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  -v ./run.yaml:/root/my-run.yaml \
  llamastack/distribution-{{ name }} \
  /root/my-run.yaml \
  --port $LLAMA_STACK_PORT \
  --env TOGETHER_API_KEY=$TOGETHER_API_KEY
```

### Via Conda

```bash
llama stack build --template together --image-type conda
llama stack run ./run.yaml \
  --port 5001 \
  --env TOGETHER_API_KEY=$TOGETHER_API_KEY
```

60 llama_stack/templates/together/together.py Normal file
@@ -0,0 +1,60 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pathlib import Path

from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.together import TogetherImplConfig
from llama_stack.providers.remote.inference.together.together import MODEL_ALIASES

from llama_stack.templates.template import DistributionTemplate, RunConfigSettings


def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ["remote::together"],
        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
    }

    inference_provider = Provider(
        provider_id="together",
        provider_type="remote::together",
        config=TogetherImplConfig.sample_run_config(),
    )

    default_models = [ModelInput(model_id=m.provider_model_id) for m in MODEL_ALIASES]

    return DistributionTemplate(
        name="together",
        distro_type="self_hosted",
        description="Use Together.AI for running LLM inference",
        docker_image=None,
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
        default_models=default_models,
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                },
                default_models=default_models,
                default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-1B")],
            ),
        },
        run_config_env_vars={
            "LLAMASTACK_PORT": (
                "5001",
                "Port for the Llama Stack distribution server",
            ),
            "TOGETHER_API_KEY": (
                "",
                "Together.AI API Key",
            ),
        },
    )