Commit 2411a44833: Update more distribution docs to be simpler and partially codegen'ed
Parent commit: e84d4436b5
51 changed files with 1188 additions and 291 deletions
Deleted (@@ -1,45 +0,0 @@): the old hand-written Bedrock run config, replaced by the symbolic link below:

```yaml
version: '2'
image_name: local
name: bedrock
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: bedrock0
    provider_type: remote::bedrock
    config:
      aws_access_key_id: <AWS_ACCESS_KEY_ID>
      aws_secret_access_key: <AWS_SECRET_ACCESS_KEY>
      aws_session_token: <AWS_SESSION_TOKEN>
      region_name: <AWS_REGION>
  memory:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
  safety:
  - provider_id: bedrock0
    provider_type: remote::bedrock
    config:
      aws_access_key_id: <AWS_ACCESS_KEY_ID>
      aws_secret_access_key: <AWS_SECRET_ACCESS_KEY>
      aws_session_token: <AWS_SESSION_TOKEN>
      region_name: <AWS_REGION>
  agents:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
```
distributions/bedrock/run.yaml (new symbolic link, @@ -0,0 +1 @@):

```
../../llama_stack/templates/bedrock/run.yaml
```

Deleted symbolic link (@@ -1 +0,0 @@):

```
../../llama_stack/templates/databricks/build.yaml
```
Distribution dependencies JSON: new dependency lists are added for `hf-serverless`, `vllm-gpu`, `bedrock`, and `hf-endpoint`, alongside the existing `together`, `remote-vllm`, and `meta-reference-gpu` entries:

```json
"hf-serverless": [
  "aiohttp",
  "aiosqlite",
  "blobfile",
  "chardet",
  "chromadb-client",
  "faiss-cpu",
  "fastapi",
  "fire",
  "httpx",
  "huggingface_hub",
  "matplotlib",
  "nltk",
  "numpy",
  "pandas",
  "pillow",
  "psycopg2-binary",
  "pypdf",
  "redis",
  "scikit-learn",
  "scipy",
  "sentencepiece",
  "tqdm",
  "transformers",
  "uvicorn",
  "sentence-transformers --no-deps",
  "torch --index-url https://download.pytorch.org/whl/cpu"
],
"vllm-gpu": [
  "aiosqlite",
  "blobfile",
  "chardet",
  "chromadb-client",
  "faiss-cpu",
  "fastapi",
  "fire",
  "httpx",
  "matplotlib",
  "nltk",
  "numpy",
  "pandas",
  "pillow",
  "psycopg2-binary",
  "pypdf",
  "redis",
  "scikit-learn",
  "scipy",
  "sentencepiece",
  "tqdm",
  "transformers",
  "uvicorn",
  "vllm",
  "sentence-transformers --no-deps",
  "torch --index-url https://download.pytorch.org/whl/cpu"
],
"bedrock": [
  "aiosqlite",
  "blobfile",
  "boto3",
  "chardet",
  "chromadb-client",
  "faiss-cpu",
  "fastapi",
  "fire",
  "httpx",
  "matplotlib",
  "nltk",
  "numpy",
  "pandas",
  "pillow",
  "psycopg2-binary",
  "pypdf",
  "redis",
  "scikit-learn",
  "scipy",
  "sentencepiece",
  "tqdm",
  "transformers",
  "uvicorn",
  "sentence-transformers --no-deps",
  "torch --index-url https://download.pytorch.org/whl/cpu"
],
"hf-endpoint": [
  "aiohttp",
  "aiosqlite",
  "blobfile",
  "chardet",
  "chromadb-client",
  "faiss-cpu",
  "fastapi",
  "fire",
  "httpx",
  "huggingface_hub",
  "matplotlib",
  "nltk",
  "numpy",
  "pandas",
  "pillow",
  "psycopg2-binary",
  "pypdf",
  "redis",
  "scikit-learn",
  "scipy",
  "sentencepiece",
  "tqdm",
  "transformers",
  "uvicorn",
  "sentence-transformers --no-deps",
  "torch --index-url https://download.pytorch.org/whl/cpu"
]
```
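Some entries are pip command fragments rather than bare requirement names (note the `--no-deps` and `--index-url` flags). A hypothetical sketch of how a build step could consume such a list; the file name and the install loop are assumptions, not the actual llama-stack build code:

```python
import json
import shlex
import subprocess
import sys

# Hypothetical helper: install one distribution's dependency list, where each
# entry may carry extra pip flags such as "--no-deps" or "--index-url ...".
def install_deps(deps_file: str, distro: str) -> None:
    with open(deps_file) as f:
        deps = json.load(f)[distro]
    for entry in deps:
        # shlex.split keeps the package name and its flags as separate argv items.
        subprocess.check_call([sys.executable, "-m", "pip", "install", *shlex.split(entry)])

# Example call (assumed path): install_deps("distributions/dependencies.json", "bedrock")
```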
Deleted symbolic links (each @@ -1 +0,0 @@):

```
../../llama_stack/templates/hf-endpoint/build.yaml
../../llama_stack/templates/hf-serverless/build.yaml
../../llama_stack/templates/ollama/build.yaml
```
Deleted (@@ -1,48 +0,0 @@): the Ollama docker compose file:

```yaml
services:
  ollama:
    image: ollama/ollama:latest
    network_mode: "host"
    volumes:
      - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
    ports:
      - "11434:11434"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
  llamastack:
    depends_on:
      - ollama
    image: llamastack/distribution-ollama
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to ollama run.yaml file
      - ./run.yaml:/root/llamastack-run-ollama.yaml
    ports:
      - "5000:5000"
    # Hack: wait for ollama server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-ollama.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
volumes:
  ollama:
```
Deleted (@@ -1,46 +0,0 @@): the old hand-written Ollama run config:

```yaml
version: '2'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: ollama
    provider_type: remote::ollama
    config:
      url: ${env.OLLAMA_URL:http://127.0.0.1:11434}
  safety:
  - provider_id: meta0
    provider_type: inline::llama-guard
    config:
      excluded_categories: []
  memory:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
  agents:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
models:
- model_id: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
  provider_id: ollama
- model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
  provider_id: ollama
shields:
- shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B}
```
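Several of these configs use `${env.VAR}` / `${env.VAR:default}` placeholders (for example `OLLAMA_URL` above and `SQLITE_STORE_DIR` in the generated configs below). A minimal sketch of the substitution semantics these placeholders assume; llama-stack resolves them internally, so this standalone resolver is illustrative only:

```python
import os
import re

# Matches ${env.VAR} and ${env.VAR:default}; the default may contain ':' and '/'.
_PLACEHOLDER = re.compile(r"\$\{env\.([A-Z0-9_]+)(?::([^}]*))?\}")

def resolve_env_placeholders(text: str) -> str:
    """Replace ${env.VAR} / ${env.VAR:default} with the env value or the default."""
    def _sub(match: re.Match) -> str:
        var, default = match.group(1), match.group(2)
        value = os.environ.get(var, default)
        if value is None:
            raise ValueError(f"environment variable {var} is required but not set")
        return value
    return _PLACEHOLDER.sub(_sub, text)

# Expands to the OLLAMA_URL value, or the default URL if the variable is unset.
print(resolve_env_placeholders("url: ${env.OLLAMA_URL:http://127.0.0.1:11434}"))
```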
Bedrock distribution docs: the hand-written guide is replaced with a simpler, partially generated section (provider table, environment variables, and Docker/Conda run instructions):

````diff
@@ -6,59 +6,58 @@
 self
 ```

-### Connect to a Llama Stack Bedrock Endpoint
-- You may connect to Amazon Bedrock APIs for running LLM inference
-
-The `llamastack/distribution-bedrock` distribution consists of the following provider configurations.
-
-| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
-|----------------- |--------------- |---------------- |---------------- |---------------- |---------------- |
-| **Provider(s)** | remote::bedrock | meta-reference | meta-reference | remote::bedrock | meta-reference |
-
-### Docker: Start the Distribution (Single Node CPU)
-
-> [!NOTE]
-> This assumes you have valid AWS credentials configured with access to Amazon Bedrock.
-
-```
-$ cd distributions/bedrock && docker compose up
+The `llamastack/distribution-bedrock` distribution consists of the following provider configurations:
+
+| API | Provider(s) |
+|-----|-------------|
+| agents | `inline::meta-reference` |
+| inference | `remote::bedrock` |
+| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
+| safety | `remote::bedrock` |
+| telemetry | `inline::meta-reference` |
+
+### Environment Variables
+
+The following environment variables can be configured:
+
+- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`)
+
+### Prerequisite: API Keys
+
+Make sure you have access to an AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/).
+
+## Running Llama Stack with AWS Bedrock
+
+You can do this via Conda (build code) or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=5001
+docker run \
+  -it \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  llamastack/distribution-bedrock \
+  --port $LLAMA_STACK_PORT \
+  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
+  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
+  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
 ```

-Make sure in your `run.yaml` file, your inference provider is pointing to the correct AWS configuration. E.g.
-```
-inference:
-  - provider_id: bedrock0
-    provider_type: remote::bedrock
-    config:
-      aws_access_key_id: <AWS_ACCESS_KEY_ID>
-      aws_secret_access_key: <AWS_SECRET_ACCESS_KEY>
-      aws_session_token: <AWS_SESSION_TOKEN>
-      region_name: <AWS_REGION>
-```
-
-### Conda llama stack run (Single Node CPU)
+### Via Conda

 ```bash
 llama stack build --template bedrock --image-type conda
-# -- modify run.yaml with valid AWS credentials
-llama stack run ./run.yaml
-```
-
-### (Optional) Update Model Serving Configuration
-
-Use `llama-stack-client models list` to check the available models served by Amazon Bedrock.
-
-```
-$ llama-stack-client models list
-+------------------------------+----------------------------------+---------------+------------+
-| identifier                   | llama_model                      | provider_id   | metadata   |
-+==============================+==================================+===============+============+
-| Llama3.1-8B-Instruct         | meta.llama3-1-8b-instruct-v1:0   | bedrock0      | {}         |
-+------------------------------+----------------------------------+---------------+------------+
-| Llama3.1-70B-Instruct        | meta.llama3-1-70b-instruct-v1:0  | bedrock0      | {}         |
-+------------------------------+----------------------------------+---------------+------------+
-| Llama3.1-405B-Instruct       | meta.llama3-1-405b-instruct-v1:0 | bedrock0      | {}         |
-+------------------------------+----------------------------------+---------------+------------+
+llama stack run ./run.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
+  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
+  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
 ```
````
Fireworks distribution docs: the Docker command no longer mounts a local `run.yaml`, and the Conda command uses `$LLAMA_STACK_PORT`:

````diff
@@ -58,9 +58,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-fireworks \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
 ```
@@ -70,6 +68,6 @@ docker run \
 ```bash
 llama stack build --template fireworks --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port $LLAMA_STACK_PORT \
   --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
 ```
````
Meta Reference GPU distribution docs: the Docker commands drop the local `run.yaml` mount, and the Conda commands point at the checked-in run configs:

````diff
@@ -54,9 +54,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-meta-reference-gpu \
-  /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 ```
@@ -67,9 +65,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-meta-reference-gpu \
-  /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
   --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
@@ -81,7 +77,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 ```bash
 llama stack build --template meta-reference-gpu --image-type conda
-llama stack run ./run.yaml \
+llama stack run distributions/meta-reference-gpu/run.yaml \
   --port 5001 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 ```
@@ -89,7 +85,7 @@ llama stack run ./run.yaml \
 If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
-llama stack run ./run-with-safety.yaml \
+llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \
   --port 5001 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
   --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
````
Ollama distribution docs: the same simplification of the Docker command:

````diff
@@ -66,9 +66,7 @@ docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-ollama \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env OLLAMA_URL=http://host.docker.internal:11434
````
TGI distribution docs: the Docker command drops the `run.yaml` mount, and the Conda commands switch to `$LLAMA_STACK_PORT` with proper line continuations:

````diff
@@ -85,9 +85,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-tgi \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
@@ -116,18 +114,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 ```bash
 llama stack build --template tgi --image-type conda
 llama stack run ./run.yaml
-  --port 5001
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
 ```

 If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
-llama stack run ./run-with-safety.yaml
-  --port 5001
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
-  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
-  --env SAFETY_MODEL=$SAFETY_MODEL
+llama stack run ./run-with-safety.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \
+  --env SAFETY_MODEL=$SAFETY_MODEL \
   --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
 ```
````
Together distribution docs: the same simplification:

````diff
@@ -57,9 +57,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-together \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env TOGETHER_API_KEY=$TOGETHER_API_KEY
 ```
@@ -69,6 +67,6 @@ docker run \
 ```bash
 llama stack build --template together --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port $LLAMA_STACK_PORT \
   --env TOGETHER_API_KEY=$TOGETHER_API_KEY
 ```
````
VLLM inline provider config (`VLLMConfig.sample_run_config`): the sample config drops the `VLLM_` prefix from its environment variables and raises the default GPU memory utilization from 0.3 to 0.7:

````diff
@@ -37,11 +37,11 @@ class VLLMConfig(BaseModel):
     @classmethod
     def sample_run_config(cls):
         return {
-            "model": "${env.VLLM_INFERENCE_MODEL:Llama3.2-3B-Instruct}",
-            "tensor_parallel_size": "${env.VLLM_TENSOR_PARALLEL_SIZE:1}",
-            "max_tokens": "${env.VLLM_MAX_TOKENS:4096}",
-            "enforce_eager": "${env.VLLM_ENFORCE_EAGER:False}",
-            "gpu_memory_utilization": "${env.VLLM_GPU_MEMORY_UTILIZATION:0.3}",
+            "model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}",
+            "tensor_parallel_size": "${env.TENSOR_PARALLEL_SIZE:1}",
+            "max_tokens": "${env.MAX_TOKENS:4096}",
+            "enforce_eager": "${env.ENFORCE_EAGER:False}",
+            "gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.7}",
         }

     @field_validator("model")
````
Bedrock inference provider config (`BedrockConfig`): the unused `json_schema_type` import and decorator are removed:

````diff
@@ -4,11 +4,8 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

-from llama_models.schema_utils import json_schema_type
-
 from llama_stack.providers.utils.bedrock.config import BedrockBaseConfig


-@json_schema_type
 class BedrockConfig(BedrockBaseConfig):
     pass
````
TGI / Hugging Face inference provider configs: `sample_run_config` classmethods are added to `InferenceEndpointImplConfig` and `InferenceAPIImplConfig` so the templates can generate provider configs:

````diff
@@ -37,6 +37,18 @@ class InferenceEndpointImplConfig(BaseModel):
         description="Your Hugging Face user access token (will default to locally saved token if not provided)",
     )

+    @classmethod
+    def sample_run_config(
+        cls,
+        endpoint_name: str = "${env.INFERENCE_ENDPOINT_NAME}",
+        api_token: str = "${env.HF_API_TOKEN}",
+        **kwargs,
+    ):
+        return {
+            "endpoint_name": endpoint_name,
+            "api_token": api_token,
+        }
+

 @json_schema_type
 class InferenceAPIImplConfig(BaseModel):
@@ -47,3 +59,15 @@ class InferenceAPIImplConfig(BaseModel):
         default=None,
         description="Your Hugging Face user access token (will default to locally saved token if not provided)",
     )
+
+    @classmethod
+    def sample_run_config(
+        cls,
+        repo: str = "${env.INFERENCE_MODEL}",
+        api_token: str = "${env.HF_API_TOKEN}",
+        **kwargs,
+    ):
+        return {
+            "huggingface_repo": repo,
+            "api_token": api_token,
+        }
````
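These `sample_run_config` helpers supply the provider configs that the distribution templates below embed in their generated run.yaml files. A small illustrative call, using only the defaults and overrides visible in this commit:

```python
# Illustrative only: mirrors how hf_endpoint.py / hf_serverless.py below build
# their provider configs from these helpers.
from llama_stack.providers.remote.inference.tgi import (
    InferenceAPIImplConfig,
    InferenceEndpointImplConfig,
)

# Defaults keep the ${env.VAR} placeholders, so the generated run.yaml stays
# environment-driven instead of hard-coding values.
print(InferenceAPIImplConfig.sample_run_config())
# {'huggingface_repo': '${env.INFERENCE_MODEL}', 'api_token': '${env.HF_API_TOKEN}'}

# A safety-specific override, as used for the hf-serverless run-with-safety config:
print(InferenceAPIImplConfig.sample_run_config(repo="${env.SAFETY_MODEL}"))

# The endpoint variant takes an endpoint name instead of a repo:
print(InferenceEndpointImplConfig.sample_run_config(endpoint_name="${env.SAFETY_INFERENCE_ENDPOINT_NAME}"))
```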
`BedrockBaseConfig`: the `json_schema_type` decorator is dropped and an (empty) `sample_run_config` is added:

````diff
@@ -5,11 +5,9 @@
 # the root directory of this source tree.
 from typing import Optional

-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field


-@json_schema_type
 class BedrockBaseConfig(BaseModel):
     aws_access_key_id: Optional[str] = Field(
         default=None,
@@ -57,3 +55,7 @@ class BedrockBaseConfig(BaseModel):
         default=3600,
         description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).",
     )
+
+    @classmethod
+    def sample_run_config(cls, **kwargs):
+        return {}
````
llama_stack/templates/bedrock/__init__.py (new file, 7 lines):

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .bedrock import get_distribution_template  # noqa: F401
```
llama_stack/templates/bedrock/bedrock.py (new file, 38 lines): the codegen template definition for the Bedrock distribution:

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from pathlib import Path

from llama_stack.templates.template import DistributionTemplate, RunConfigSettings


def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ["remote::bedrock"],
        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["remote::bedrock"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
    }

    return DistributionTemplate(
        name="bedrock",
        distro_type="self_hosted",
        description="Use AWS Bedrock for running LLM inference and safety",
        docker_image=None,
        template_path=Path(__file__).parent / "doc_template.md",
        providers=providers,
        default_models=[],
        run_configs={
            "run.yaml": RunConfigSettings(),
        },
        run_config_env_vars={
            "LLAMASTACK_PORT": (
                "5001",
                "Port for the Llama Stack distribution server",
            ),
        },
    )
```
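The template is purely declarative. Assuming the constructor arguments are exposed as attributes on `DistributionTemplate` (an assumption, not verified against the class itself), inspecting it might look like this:

```python
# Hedged sketch: attribute names are assumed to mirror the constructor
# arguments shown in bedrock.py above.
from llama_stack.templates.bedrock import get_distribution_template

template = get_distribution_template()
print(template.name)                  # "bedrock"
print(sorted(template.providers))     # ["agents", "inference", "memory", "safety", "telemetry"]
print(list(template.run_configs))     # ["run.yaml"]
for var, (default, description) in template.run_config_env_vars.items():
    print(f"{var} (default {default}): {description}")
```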
Bedrock template `build.yaml`: providers become lists, memory gains Chroma and pgvector, and safety moves to `remote::bedrock`:

````diff
@@ -1,9 +1,19 @@
+version: '2'
 name: bedrock
 distribution_spec:
-  description: Use Amazon Bedrock APIs.
+  description: Use AWS Bedrock for running LLM inference and safety
+  docker_image: null
   providers:
-    inference: remote::bedrock
-    memory: inline::faiss
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    inference:
+    - remote::bedrock
+    memory:
+    - inline::faiss
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - remote::bedrock
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda
````
llama_stack/templates/bedrock/doc_template.md (new file, 63 lines): the Jinja source from which the Bedrock docs above are generated:

````jinja
# Bedrock Distribution

The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:

{{ providers_table }}

{% if run_config_env_vars %}
### Environment Variables

The following environment variables can be configured:

{% for var, (default_value, description) in run_config_env_vars.items() %}
- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
{% endfor %}
{% endif %}

{% if default_models %}
### Models

The following models are available by default:

{% for model in default_models %}
- `{{ model.model_id }} ({{ model.provider_model_id }})`
{% endfor %}
{% endif %}


### Prerequisite: API Keys

Make sure you have access to an AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/).


## Running Llama Stack with AWS Bedrock

You can do this via Conda (build code) or Docker which has a pre-built image.

### Via Docker

This method allows you to get started quickly without having to build the distribution code.

```bash
LLAMA_STACK_PORT=5001
docker run \
  -it \
  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
  llamastack/distribution-{{ name }} \
  --port $LLAMA_STACK_PORT \
  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```

### Via Conda

```bash
llama stack build --template {{ name }} --image-type conda
llama stack run ./run.yaml \
  --port $LLAMA_STACK_PORT \
  --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \
  --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \
  --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
```
````
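The doc template is plain Jinja. A rough sketch of how a codegen step might render it; the context values mirror what `bedrock.py` declares, but the rendering driver shown here is an assumption rather than the actual llama-stack code:

```python
from pathlib import Path

from jinja2 import Template  # assumption: standard Jinja2 rendering

# Values mirroring the bedrock template; the real codegen presumably derives
# providers_table from the providers dict rather than hard-coding it.
context = {
    "name": "bedrock",
    "providers_table": "| API | Provider(s) |\n|-----|-------------|\n| inference | `remote::bedrock` |",
    "run_config_env_vars": {
        "LLAMASTACK_PORT": ("5001", "Port for the Llama Stack distribution server"),
    },
    "default_models": [],
}

template = Template(Path("llama_stack/templates/bedrock/doc_template.md").read_text())
print(template.render(**context))
```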
llama_stack/templates/bedrock/run.yaml (new generated run config, 49 lines):

```yaml
version: '2'
image_name: bedrock
docker_image: null
conda_env: bedrock
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
  inference:
  - provider_id: bedrock
    provider_type: remote::bedrock
    config: {}
  memory:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/faiss_store.db
  safety:
  - provider_id: bedrock
    provider_type: remote::bedrock
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/registry.db
models: []
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []
```
The Databricks template `build.yaml` is deleted (@@ -1,9 +0,0 @@):

```yaml
name: databricks
distribution_spec:
  description: Use Databricks for running LLM inference
  providers:
    inference: remote::databricks
    memory: inline::faiss
    safety: inline::llama-guard
    agents: meta-reference
    telemetry: meta-reference
```
Fireworks `doc_template.md`: the same simplification as the rendered Fireworks docs above, parameterized on `{{ name }}`:

````diff
@@ -43,9 +43,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
 ```
@@ -55,6 +53,6 @@ docker run \
 ```bash
 llama stack build --template fireworks --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port $LLAMA_STACK_PORT \
   --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY
 ```
````
llama_stack/templates/hf-endpoint/__init__.py (new file, 7 lines):

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .hf_endpoint import get_distribution_template  # noqa: F401
```
hf-endpoint template `build.yaml`:

````diff
@@ -1,9 +1,19 @@
+version: '2'
 name: hf-endpoint
 distribution_spec:
-  description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints."
+  description: Use (an external) Hugging Face Inference Endpoint for running LLM inference
+  docker_image: null
   providers:
-    inference: remote::hf::endpoint
-    memory: inline::faiss
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    inference:
+    - remote::hf::endpoint
+    memory:
+    - inline::faiss
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda
````
llama_stack/templates/hf-endpoint/hf_endpoint.py (new file, 97 lines): defines both a plain run config and a run-with-safety variant:

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings


def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ["remote::hf::endpoint"],
        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
    }

    inference_provider = Provider(
        provider_id="hf-endpoint",
        provider_type="remote::hf::endpoint",
        config=InferenceEndpointImplConfig.sample_run_config(),
    )

    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="hf-endpoint",
    )
    safety_model = ModelInput(
        model_id="${env.SAFETY_MODEL}",
        provider_id="hf-endpoint-safety",
    )

    return DistributionTemplate(
        name="hf-endpoint",
        distro_type="self_hosted",
        description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
        docker_image=None,
        template_path=None,
        providers=providers,
        default_models=[inference_model, safety_model],
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                },
                default_models=[inference_model],
            ),
            "run-with-safety.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [
                        inference_provider,
                        Provider(
                            provider_id="hf-endpoint-safety",
                            provider_type="remote::hf::endpoint",
                            config=InferenceEndpointImplConfig.sample_run_config(
                                endpoint_name="${env.SAFETY_INFERENCE_ENDPOINT_NAME}",
                            ),
                        ),
                    ]
                },
                default_models=[
                    inference_model,
                    safety_model,
                ],
                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
            ),
        },
        run_config_env_vars={
            "LLAMASTACK_PORT": (
                "5001",
                "Port for the Llama Stack distribution server",
            ),
            "HF_API_TOKEN": (
                "hf_...",
                "Hugging Face API token",
            ),
            "INFERENCE_ENDPOINT_NAME": (
                "",
                "HF Inference endpoint name for the main inference model",
            ),
            "SAFETY_INFERENCE_ENDPOINT_NAME": (
                "",
                "HF Inference endpoint for the safety model",
            ),
            "INFERENCE_MODEL": (
                "meta-llama/Llama-3.2-3B-Instruct",
                "Inference model served by the HF Inference Endpoint",
            ),
            "SAFETY_MODEL": (
                "meta-llama/Llama-Guard-3-1B",
                "Safety model served by the HF Inference Endpoint",
            ),
        },
    )
```
llama_stack/templates/hf-endpoint/run-with-safety.yaml (new generated run config, 68 lines):

```yaml
version: '2'
image_name: hf-endpoint
docker_image: null
conda_env: hf-endpoint
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
  inference:
  - provider_id: hf-endpoint
    provider_type: remote::hf::endpoint
    config:
      endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
      api_token: ${env.HF_API_TOKEN}
  - provider_id: hf-endpoint-safety
    provider_type: remote::hf::endpoint
    config:
      endpoint_name: ${env.SAFETY_INFERENCE_ENDPOINT_NAME}
      api_token: ${env.HF_API_TOKEN}
  memory:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: hf-endpoint
  provider_model_id: null
- metadata: {}
  model_id: ${env.SAFETY_MODEL}
  provider_id: hf-endpoint-safety
  provider_model_id: null
shields:
- params: null
  shield_id: ${env.SAFETY_MODEL}
  provider_id: null
  provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []
```
llama_stack/templates/hf-endpoint/run.yaml (new generated run config, 55 lines):

```yaml
version: '2'
image_name: hf-endpoint
docker_image: null
conda_env: hf-endpoint
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
  inference:
  - provider_id: hf-endpoint
    provider_type: remote::hf::endpoint
    config:
      endpoint_name: ${env.INFERENCE_ENDPOINT_NAME}
      api_token: ${env.HF_API_TOKEN}
  memory:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: hf-endpoint
  provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []
```
llama_stack/templates/hf-serverless/__init__.py (new file, 7 lines):

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .hf_serverless import get_distribution_template  # noqa: F401
```
hf-serverless template `build.yaml`:

````diff
@@ -1,9 +1,19 @@
+version: '2'
 name: hf-serverless
 distribution_spec:
-  description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference."
+  description: Use (an external) Hugging Face Inference Endpoint for running LLM inference
+  docker_image: null
   providers:
-    inference: remote::hf::serverless
-    memory: inline::faiss
-    safety: inline::llama-guard
-    agents: inline::meta-reference
-    telemetry: inline::meta-reference
+    inference:
+    - remote::hf::serverless
+    memory:
+    - inline::faiss
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda
````
llama_stack/templates/hf-serverless/hf_serverless.py (new file, 89 lines): mirrors the hf-endpoint template but uses the serverless Inference API config:

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig
from llama_stack.templates.template import DistributionTemplate, RunConfigSettings


def get_distribution_template() -> DistributionTemplate:
    providers = {
        "inference": ["remote::hf::serverless"],
        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
        "safety": ["inline::llama-guard"],
        "agents": ["inline::meta-reference"],
        "telemetry": ["inline::meta-reference"],
    }

    inference_provider = Provider(
        provider_id="hf-serverless",
        provider_type="remote::hf::serverless",
        config=InferenceAPIImplConfig.sample_run_config(),
    )

    inference_model = ModelInput(
        model_id="${env.INFERENCE_MODEL}",
        provider_id="hf-serverless",
    )
    safety_model = ModelInput(
        model_id="${env.SAFETY_MODEL}",
        provider_id="hf-serverless-safety",
    )

    return DistributionTemplate(
        name="hf-serverless",
        distro_type="self_hosted",
        description="Use (an external) Hugging Face Inference Endpoint for running LLM inference",
        docker_image=None,
        template_path=None,
        providers=providers,
        default_models=[inference_model, safety_model],
        run_configs={
            "run.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [inference_provider],
                },
                default_models=[inference_model],
            ),
            "run-with-safety.yaml": RunConfigSettings(
                provider_overrides={
                    "inference": [
                        inference_provider,
                        Provider(
                            provider_id="hf-serverless-safety",
                            provider_type="remote::hf::serverless",
                            config=InferenceAPIImplConfig.sample_run_config(
                                repo="${env.SAFETY_MODEL}",
                            ),
                        ),
                    ]
                },
                default_models=[
                    inference_model,
                    safety_model,
                ],
                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
            ),
        },
        run_config_env_vars={
            "LLAMASTACK_PORT": (
                "5001",
                "Port for the Llama Stack distribution server",
            ),
            "HF_API_TOKEN": (
                "hf_...",
                "Hugging Face API token",
            ),
            "INFERENCE_MODEL": (
                "meta-llama/Llama-3.2-3B-Instruct",
                "Inference model to be served by the HF Serverless endpoint",
            ),
            "SAFETY_MODEL": (
                "meta-llama/Llama-Guard-3-1B",
                "Safety model to be served by the HF Serverless endpoint",
            ),
        },
    )
```
llama_stack/templates/hf-serverless/run-with-safety.yaml (new generated run config, 68 lines):

```yaml
version: '2'
image_name: hf-serverless
docker_image: null
conda_env: hf-serverless
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
  inference:
  - provider_id: hf-serverless
    provider_type: remote::hf::serverless
    config:
      huggingface_repo: ${env.INFERENCE_MODEL}
      api_token: ${env.HF_API_TOKEN}
  - provider_id: hf-serverless-safety
    provider_type: remote::hf::serverless
    config:
      huggingface_repo: ${env.SAFETY_MODEL}
      api_token: ${env.HF_API_TOKEN}
  memory:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: hf-serverless
  provider_model_id: null
- metadata: {}
  model_id: ${env.SAFETY_MODEL}
  provider_id: hf-serverless-safety
  provider_model_id: null
shields:
- params: null
  shield_id: ${env.SAFETY_MODEL}
  provider_id: null
  provider_shield_id: null
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []
```
llama_stack/templates/hf-serverless/run.yaml (new generated run config, 55 lines):

```yaml
version: '2'
image_name: hf-serverless
docker_image: null
conda_env: hf-serverless
apis:
- agents
- inference
- memory
- safety
- telemetry
providers:
  inference:
  - provider_id: hf-serverless
    provider_type: remote::hf::serverless
    config:
      huggingface_repo: ${env.INFERENCE_MODEL}
      api_token: ${env.HF_API_TOKEN}
  memory:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
metadata_store:
  namespace: null
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: hf-serverless
  provider_model_id: null
shields: []
memory_banks: []
datasets: []
scoring_fns: []
eval_tasks: []
```
The old hand-written meta-reference-gpu `build.yaml` is deleted (@@ -1,13 +0,0 @@):

```yaml
name: meta-reference-gpu
distribution_spec:
  docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
  description: Use code from `llama_stack` itself to serve all llama stack APIs
  providers:
    inference: inline::meta-reference
    memory:
    - inline::faiss
    - remote::chromadb
    - remote::pgvector
    safety: inline::llama-guard
    agents: inline::meta-reference
    telemetry: inline::meta-reference
```
Meta Reference GPU `doc_template.md`: the same changes as the rendered docs, parameterized on `{{ name }}`:

````diff
@@ -40,9 +40,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 ```
@@ -53,9 +51,7 @@ If you are using Llama Stack Safety / Shield APIs, use:
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run-with-safety.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
   --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
@@ -66,8 +62,8 @@ docker run \
 Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.

 ```bash
-llama stack build --template meta-reference-gpu --image-type conda
-llama stack run ./run.yaml \
+llama stack build --template {{ name }} --image-type conda
+llama stack run distributions/{{ name }}/run.yaml \
   --port 5001 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 ```
@@ -75,7 +71,7 @@ llama stack run ./run.yaml \
 If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
-llama stack run ./run-with-safety.yaml \
+llama stack run distributions/{{ name }}/run-with-safety.yaml \
   --port 5001 \
   --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
   --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
````
New `__init__.py` (@@ -0,0 +1,7 @@) re-exporting the meta-reference template:

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from .meta_reference import get_distribution_template  # noqa: F401
```
|
@@ -0,0 +1,54 @@
+# Meta Reference Quantized Distribution
+
+The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations.
+
+
+| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
+|----------------- |------------------------ |---------------- |-------------------------------------------------- |---------------- |---------------- |
+| **Provider(s)** | meta-reference-quantized | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference |
+
+The only difference vs. the `meta-reference-gpu` distribution is that it has support for more efficient inference -- with fp8, int4 quantization, etc.
+
+### Step 0. Prerequisite - Downloading Models
+Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) to download the models.
+
+```
+$ ls ~/.llama/checkpoints
+Llama3.2-3B-Instruct:int4-qlora-eo8
+```
+
+### Step 1. Start the Distribution
+#### (Option 1) Start with Docker
+```
+$ cd distributions/meta-reference-quantized-gpu && docker compose up
+```
+
+> [!NOTE]
+> This assumes you have access to a GPU to start a local server with access to your GPU.
+
+
+> [!NOTE]
+> `~/.llama` should be the path containing downloaded weights of Llama models.
+
+
+This will download and start running a pre-built docker container. Alternatively, you may use the following commands:
+
+```
+docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-quantized-gpu --yaml_config /root/my-run.yaml
+```
+
+#### (Option 2) Start with Conda
+
+1. Install the `llama` CLI. See the [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html)
+
+2. Build the `meta-reference-quantized-gpu` distribution
+
+```
+$ llama stack build --template meta-reference-quantized-gpu --image-type conda
+```
+
+3. Start running the distribution
+```
+$ cd distributions/meta-reference-quantized-gpu
+$ llama stack run ./run.yaml
+```
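Step 0 above assumes the quantized checkpoints already sit under `~/.llama/checkpoints`. As a quick illustration of that prerequisite (not part of this commit; the helper name and directory layout are assumptions based on the `ls` output shown above), a pre-flight check might look like this:

```python
from pathlib import Path


def has_checkpoint(name: str, root: Path = Path.home() / ".llama" / "checkpoints") -> bool:
    """Return True if a checkpoint entry with the given name exists under root."""
    return (root / name).exists()


if __name__ == "__main__":
    # The checkpoint name comes from the `ls ~/.llama/checkpoints` example above.
    print(has_checkpoint("Llama3.2-3B-Instruct:int4-qlora-eo8"))
```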
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pathlib import Path
+
+from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.inference.meta_reference import (
+    MetaReferenceInferenceConfig,
+)
+from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
+
+
+def get_distribution_template() -> DistributionTemplate:
+    providers = {
+        "inference": ["inline::meta-reference"],
+        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+    }
+
+    inference_provider = Provider(
+        provider_id="meta-reference-inference",
+        provider_type="inline::meta-reference",
+        config=MetaReferenceInferenceConfig.sample_run_config(
+            model="${env.INFERENCE_MODEL}",
+            checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}",
+        ),
+    )
+
+    inference_model = ModelInput(
+        model_id="${env.INFERENCE_MODEL}",
+        provider_id="meta-reference-inference",
+    )
+    safety_model = ModelInput(
+        model_id="${env.SAFETY_MODEL}",
+        provider_id="meta-reference-safety",
+    )
+
+    return DistributionTemplate(
+        name="meta-reference-gpu",
+        distro_type="self_hosted",
+        description="Use Meta Reference for running LLM inference",
+        template_path=Path(__file__).parent / "doc_template.md",
+        providers=providers,
+        default_models=[inference_model, safety_model],
+        run_configs={
+            "run.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [inference_provider],
+                },
+                default_models=[inference_model],
+            ),
+            "run-with-safety.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [
+                        inference_provider,
+                        Provider(
+                            provider_id="meta-reference-safety",
+                            provider_type="inline::meta-reference",
+                            config=MetaReferenceInferenceConfig.sample_run_config(
+                                model="${env.SAFETY_MODEL}",
+                                checkpoint_dir="${env.SAFETY_CHECKPOINT_DIR:null}",
+                            ),
+                        ),
+                    ],
+                },
+                default_models=[
+                    inference_model,
+                    safety_model,
+                ],
+                default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
+            ),
+        },
+        run_config_env_vars={
+            "LLAMASTACK_PORT": (
+                "5001",
+                "Port for the Llama Stack distribution server",
+            ),
+            "INFERENCE_MODEL": (
+                "meta-llama/Llama-3.2-3B-Instruct",
+                "Inference model loaded into the Meta Reference server",
+            ),
+            "INFERENCE_CHECKPOINT_DIR": (
+                "null",
+                "Directory containing the Meta Reference model checkpoint",
+            ),
+            "SAFETY_MODEL": (
+                "meta-llama/Llama-Guard-3-1B",
+                "Name of the safety (Llama-Guard) model to use",
+            ),
+            "SAFETY_CHECKPOINT_DIR": (
+                "null",
+                "Directory containing the Llama-Guard model checkpoint",
+            ),
+        },
+    )
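For orientation, here is a rough sketch (not part of the diff) of what the codegen step consumes from a template module like the one above. It touches only fields visible in this commit (`name`, `providers`, `run_configs`, `run_config_env_vars`); the flat `meta_reference` import path is an assumption for illustration, since the real package layout may differ.

```python
# Hypothetical usage sketch: assumes the file above is importable as `meta_reference`.
from meta_reference import get_distribution_template

template = get_distribution_template()
print(template.name)               # "meta-reference-gpu"
print(sorted(template.providers))  # APIs covered: agents, inference, memory, safety, telemetry
print(list(template.run_configs))  # ["run.yaml", "run-with-safety.yaml"]

# Each env var maps to a (default, description) pair used for the generated docs.
for var, (default, description) in (template.run_config_env_vars or {}).items():
    print(f"{var}={default!r}  # {description}")
```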
@@ -55,9 +55,7 @@ docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
   -v ~/.llama:/root/.llama \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env OLLAMA_URL=http://host.docker.internal:11434
@@ -86,7 +84,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 ```bash
 export LLAMA_STACK_PORT=5001

-llama stack build --template ollama --image-type conda
+llama stack build --template {{ name }} --image-type conda
 llama stack run ./run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
@@ -27,7 +27,7 @@ from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig

 class RunConfigSettings(BaseModel):
     provider_overrides: Dict[str, List[Provider]] = Field(default_factory=dict)
-    default_models: List[ModelInput]
+    default_models: Optional[List[ModelInput]] = None
     default_shields: Optional[List[ShieldInput]] = None

     def run_config(
@@ -87,7 +87,7 @@ class RunConfigSettings(BaseModel):
                 __distro_dir__=f"distributions/{name}",
                 db_name="registry.db",
             ),
-            models=self.default_models,
+            models=self.default_models or [],
             shields=self.default_shields or [],
         )

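The two changes above go together: once `default_models` may be `None`, the run-config assembly needs the `or []` fallback. A stand-in sketch of the pattern (a simplified model, not the real `RunConfigSettings`):

```python
from typing import List, Optional

from pydantic import BaseModel


class Settings(BaseModel):
    """Simplified stand-in for RunConfigSettings."""
    default_models: Optional[List[str]] = None
    default_shields: Optional[List[str]] = None


settings = Settings()  # neither field supplied
models = settings.default_models or []    # [] rather than None
shields = settings.default_shields or []  # same fallback the template code uses
print(models, shields)
```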
@@ -104,7 +104,7 @@ class DistributionTemplate(BaseModel):

     providers: Dict[str, List[str]]
     run_configs: Dict[str, RunConfigSettings]
-    template_path: Path
+    template_path: Optional[Path] = None

     # Optional configuration
     run_config_env_vars: Optional[Dict[str, Tuple[str, str]]] = None
@@ -159,6 +159,7 @@ class DistributionTemplate(BaseModel):
             with open(yaml_output_dir / yaml_pth, "w") as f:
                 yaml.safe_dump(run_config.model_dump(), f, sort_keys=False)

-        docs = self.generate_markdown_docs()
-        with open(doc_output_dir / f"{self.name}.md", "w") as f:
-            f.write(docs if docs.endswith("\n") else docs + "\n")
+        if self.template_path:
+            docs = self.generate_markdown_docs()
+            with open(doc_output_dir / f"{self.name}.md", "w") as f:
+                f.write(docs if docs.endswith("\n") else docs + "\n")
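The guard above pairs with `template_path` becoming optional: templates that ship no `doc_template.md` (such as the new `vllm-gpu` template later in this commit) still get their YAML run configs written, but the markdown step is skipped. A minimal illustration of that control flow, with hypothetical helper names:

```python
from pathlib import Path
from typing import Optional


def save_docs(template_path: Optional[Path], name: str, doc_output_dir: Path, docs: str) -> None:
    """Write generated docs only when the template provides a doc_template.md."""
    if template_path:  # mirrors the `if self.template_path:` guard added above
        out = doc_output_dir / f"{name}.md"
        out.write_text(docs if docs.endswith("\n") else docs + "\n")


save_docs(None, "vllm-gpu", Path("."), "unused docs")  # skipped: no doc template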
@@ -71,9 +71,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT
@@ -102,18 +100,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
 ```bash
 llama stack build --template {{ name }} --image-type conda
 llama stack run ./run.yaml
-  --port 5001
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
 ```

 If you are using Llama Stack Safety / Shield APIs, use:

 ```bash
-llama stack run ./run-with-safety.yaml
-  --port 5001
-  --env INFERENCE_MODEL=$INFERENCE_MODEL
-  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT
-  --env SAFETY_MODEL=$SAFETY_MODEL
+llama stack run ./run-with-safety.yaml \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \
+  --env SAFETY_MODEL=$SAFETY_MODEL \
   --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT
 ```
@@ -43,9 +43,7 @@ LLAMA_STACK_PORT=5001
 docker run \
   -it \
   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-  -v ./run.yaml:/root/my-run.yaml \
   llamastack/distribution-{{ name }} \
-  --yaml-config /root/my-run.yaml \
   --port $LLAMA_STACK_PORT \
   --env TOGETHER_API_KEY=$TOGETHER_API_KEY
 ```
@@ -53,8 +51,8 @@ docker run \
 ### Via Conda

 ```bash
-llama stack build --template together --image-type conda
+llama stack build --template {{ name }} --image-type conda
 llama stack run ./run.yaml \
-  --port 5001 \
+  --port $LLAMA_STACK_PORT \
   --env TOGETHER_API_KEY=$TOGETHER_API_KEY
 ```
llama_stack/templates/vllm-gpu/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .vllm import get_distribution_template  # noqa: F401
llama_stack/templates/vllm-gpu/build.yaml (new file, 19 lines)
@@ -0,0 +1,19 @@
+version: '2'
+name: vllm-gpu
+distribution_spec:
+  description: Use a built-in vLLM engine for running LLM inference
+  docker_image: null
+  providers:
+    inference:
+    - inline::vllm
+    memory:
+    - inline::faiss
+    - remote::chromadb
+    - remote::pgvector
+    safety:
+    - inline::llama-guard
+    agents:
+    - inline::meta-reference
+    telemetry:
+    - inline::meta-reference
+image_type: conda
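As a quick way to see what the build spec above asks for, the following sketch (assuming PyYAML is available and the content is saved locally as `build.yaml`) lists the requested providers per API:

```python
import yaml  # PyYAML

with open("build.yaml") as f:
    spec = yaml.safe_load(f)

# distribution_spec.providers maps each API to the provider types to include.
for api, providers in spec["distribution_spec"]["providers"].items():
    print(f"{api}: {', '.join(providers)}")
print("image type:", spec["image_type"])
```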
llama_stack/templates/vllm-gpu/run.yaml (new file, 58 lines)
@@ -0,0 +1,58 @@
+version: '2'
+image_name: vllm-gpu
+docker_image: null
+conda_env: vllm-gpu
+apis:
+- agents
+- inference
+- memory
+- safety
+- telemetry
+providers:
+  inference:
+  - provider_id: vllm
+    provider_type: inline::vllm
+    config:
+      model: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
+      tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:1}
+      max_tokens: ${env.MAX_TOKENS:4096}
+      enforce_eager: ${env.ENFORCE_EAGER:False}
+      gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7}
+  memory:
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config: {}
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/agents_store.db
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config: {}
+metadata_store:
+  namespace: null
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/registry.db
+models:
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm
+  provider_model_id: null
+shields: []
+memory_banks: []
+datasets: []
+scoring_fns: []
+eval_tasks: []
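The run config above leans on `${env.VAR:default}` placeholders (for example `${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}`). The snippet below is an illustrative resolver for that convention only, not Llama Stack's actual substitution code:

```python
import os
import re

_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z0-9_]+)(?::([^}]*))?\}")


def resolve(value: str) -> str:
    """Replace ${env.NAME:default} with the environment value, else the default."""
    def _sub(match: re.Match) -> str:
        name, default = match.group(1), match.group(2) or ""
        return os.environ.get(name, default)
    return _PLACEHOLDER.sub(_sub, value)


print(resolve("${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}"))
print(resolve("${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db"))
```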
llama_stack/templates/vllm-gpu/vllm.py (new file, 74 lines)
@@ -0,0 +1,74 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.distribution.datatypes import ModelInput, Provider
+from llama_stack.providers.inline.inference.vllm import VLLMConfig
+from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
+
+
+def get_distribution_template() -> DistributionTemplate:
+    providers = {
+        "inference": ["inline::vllm"],
+        "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
+        "safety": ["inline::llama-guard"],
+        "agents": ["inline::meta-reference"],
+        "telemetry": ["inline::meta-reference"],
+    }
+
+    inference_provider = Provider(
+        provider_id="vllm",
+        provider_type="inline::vllm",
+        config=VLLMConfig.sample_run_config(),
+    )
+
+    inference_model = ModelInput(
+        model_id="${env.INFERENCE_MODEL}",
+        provider_id="vllm",
+    )
+
+    return DistributionTemplate(
+        name="vllm-gpu",
+        distro_type="self_hosted",
+        description="Use a built-in vLLM engine for running LLM inference",
+        docker_image=None,
+        template_path=None,
+        providers=providers,
+        default_models=[inference_model],
+        run_configs={
+            "run.yaml": RunConfigSettings(
+                provider_overrides={
+                    "inference": [inference_provider],
+                },
+                default_models=[inference_model],
+            ),
+        },
+        run_config_env_vars={
+            "LLAMASTACK_PORT": (
+                "5001",
+                "Port for the Llama Stack distribution server",
+            ),
+            "INFERENCE_MODEL": (
+                "meta-llama/Llama-3.2-3B-Instruct",
+                "Inference model loaded into the vLLM engine",
+            ),
+            "TENSOR_PARALLEL_SIZE": (
+                "1",
+                "Number of tensor parallel replicas (number of GPUs to use).",
+            ),
+            "MAX_TOKENS": (
+                "4096",
+                "Maximum number of tokens to generate.",
+            ),
+            "ENFORCE_EAGER": (
+                "False",
+                "Whether to use eager mode for inference (otherwise cuda graphs are used).",
+            ),
+            "GPU_MEMORY_UTILIZATION": (
+                "0.7",
+                "GPU memory utilization for the vLLM engine.",
+            ),
+        },
+    )