forked from phoenix-oss/llama-stack-mirror
Add a meta-reference-quantized-gpu distribution
parent f5dcc03742
commit 05a8d47b98

6 changed files with 104 additions and 4 deletions

@@ -7,6 +7,7 @@ A Distribution is where APIs and Providers are assembled together to provide a c

| **Distribution** | **Llama Stack Docker** | Start This Distribution | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|:----------------: |:------------------------------------------: |:-----------------------: |:------------------: |:------------------: |:------------------: |:------------------: |:------------------: |
| Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](./meta-reference-gpu/) | meta-reference | meta-reference | meta-reference; remote::pgvector; remote::chromadb | meta-reference | meta-reference |
| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](./meta-reference-quantized-gpu/) | meta-reference-quantized | meta-reference | meta-reference; remote::pgvector; remote::chromadb | meta-reference | meta-reference |
| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](./ollama/) | remote::ollama | meta-reference | remote::pgvector; remote::chromadb | remote::ollama | meta-reference |
| TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](./tgi/) | remote::tgi | meta-reference | meta-reference; remote::pgvector; remote::chromadb | meta-reference | meta-reference |
| Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](./together/) | remote::together | meta-reference | remote::weaviate | meta-reference | meta-reference |
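
As a quick, hedged illustration of how the table above is used, pulling the pre-built image for the new distribution might look like the following; this assumes Docker is installed and that the `llamastack/distribution-meta-reference-quantized-gpu` repository linked above is published on Docker Hub.

```
# Sketch: pull the pre-built image for the new quantized distribution.
# Image name taken from the table above; availability of a default tag is assumed.
docker pull llamastack/distribution-meta-reference-quantized-gpu
```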

34 distributions/meta-reference-quantized-gpu/README.md Normal file

@@ -0,0 +1,34 @@

# Meta Reference Quantized Distribution

The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations.

| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
|----------------- |------------------------ |---------------- |-------------------------------------------------- |---------------- |---------------- |
| **Provider(s)** | meta-reference-quantized | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference |

The only difference from the `meta-reference-gpu` distribution is that it adds support for more efficient inference via fp8, int4, and other quantization schemes.

### Start the Distribution (Single Node GPU)

> [!NOTE]
> This assumes you have access to a GPU and want to start a local server that can use it.

> [!NOTE]
> `~/.llama` should be the path containing downloaded weights of Llama models.
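
If the weights are not yet under `~/.llama`, a minimal sketch of fetching them with the `llama` CLI is shown below. It assumes the CLI from `pip install llama-stack` and a signed download URL from Meta; exact flags can differ between llama-stack versions, so treat this as illustrative rather than authoritative.

```
# Sketch: download the model referenced by run.yaml into ~/.llama.
# Assumes you have requested access at llama.meta.com and received a signed URL.
pip install llama-stack
llama download --source meta --model-id Llama3.2-3B-Instruct --meta-url '<SIGNED_META_URL>'
```
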
To download and start running a pre-built docker container, you may use the following command:

```
docker run -it -p 5000:5000 -v ~/.llama:/root/.llama \
  -v ./run.yaml:/root/my-run.yaml \
  --gpus=all \
  distribution-meta-reference-quantized-gpu \
  --yaml_config /root/my-run.yaml
```

### Alternative (Build and start distribution locally via conda)

- You may check out the [Getting Started](../../docs/getting_started.md) guide for more details on building locally via conda and starting up the distribution; a brief sketch is shown below.
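
As a rough sketch of the conda path (assuming an activated conda environment with `pip install llama-stack`, and noting that command names and flags may differ between llama-stack versions), building from `build.yaml` and serving `run.yaml` might look like this:

```
# Sketch: build the distribution described by build.yaml, then serve run.yaml.
cd distributions/meta-reference-quantized-gpu
llama stack build --config build.yaml
llama stack run ./run.yaml --port 5000
```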

14 distributions/meta-reference-quantized-gpu/build.yaml Normal file

@@ -0,0 +1,14 @@

name: meta-reference-quantized-gpu
distribution_spec:
  docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
  description: Use code from `llama_stack` itself to serve all llama stack APIs
  providers:
    inference: meta-reference-quantized
    memory:
    - meta-reference
    - remote::chromadb
    - remote::pgvector
    safety: meta-reference
    agents: meta-reference
    telemetry: meta-reference
image_type: docker

51 distributions/meta-reference-quantized-gpu/run.yaml Normal file

@@ -0,0 +1,51 @@

version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: meta0
    provider_type: meta-reference-quantized
    config:
      model: Llama3.2-3B-Instruct
      quantization:
        type: fp8
      torch_seed: null
      max_seq_len: 2048
      max_batch_size: 1
  safety:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      llama_guard_shield:
        model: Llama-Guard-3-1B
        excluded_categories: []
        disable_input_check: false
        disable_output_check: false
      prompt_guard_shield:
        model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}
  agents:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}
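
Once a server is running with this configuration (for example via the docker command in the README above), a hedged smoke test might look like the sketch below. The `/inference/chat_completion` route and the JSON shape are assumptions based on the pre-1.0 llama-stack REST layout and may not match every version.

```
# Sketch: query the running stack on port 5000.
# Endpoint path and payload shape are assumptions; check your version's API docs.
curl -s http://localhost:5000/inference/chat_completion \
  -H 'Content-Type: application/json' \
  -d '{"model": "Llama3.2-3B-Instruct", "messages": [{"role": "user", "content": "Hello!"}], "stream": false}'
```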

@@ -97,7 +97,7 @@ if [ -n "$pip_dependencies" ]; then
 fi

 if [ -n "$special_pip_deps" ]; then
-  IFS='#' read -ra parts <<< "$special_pip_deps"
+  IFS='#' read -ra parts <<<"$special_pip_deps"
   for part in "${parts[@]}"; do
     add_to_docker "RUN pip install $part"
   done

@@ -127,7 +127,7 @@ if [ -n "$LLAMA_MODELS_DIR" ]; then
   mounts="$mounts -v $(readlink -f $LLAMA_MODELS_DIR):$models_mount"
 fi

-if command -v selinuxenabled &> /dev/null && selinuxenabled; then
+if command -v selinuxenabled &>/dev/null && selinuxenabled; then
   # Disable SELinux labels -- we don't want to relabel the llama-stack source dir
   DOCKER_OPTS="$DOCKER_OPTS --security-opt label=disable"
 fi

@@ -139,4 +139,4 @@ $DOCKER_BINARY build $DOCKER_OPTS -t $image_name -f "$TEMP_DIR/Dockerfile" "$REP
 rm -rf $REPO_CONFIGS_DIR
 set +x

-echo "Success! You can run it with: $DOCKER_BINARY $DOCKER_OPTS run -p 5000:5000 $image_name"
+echo "Success!"

@@ -36,7 +36,7 @@ def available_providers() -> List[ProviderSpec]:
         pip_packages=(
             META_REFERENCE_DEPS
             + [
-                "fbgemm-gpu==0.8.0",
+                "fbgemm-gpu",
             ]
         ),
         module="llama_stack.providers.impls.meta_reference.inference",