diff --git a/distributions/README.md b/distributions/README.md
index dc1e3cc25..4dc2b9d03 100644
--- a/distributions/README.md
+++ b/distributions/README.md
@@ -7,6 +7,7 @@ A Distribution is where APIs and Providers are assembled together to provide a c
 | **Distribution** | **Llama Stack Docker** | Start This Distribution | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
 |:----------------: |:------------------------------------------: |:-----------------------: |:------------------: |:------------------: |:------------------: |:------------------: |:------------------: |
 | Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](./meta-reference-gpu/) | meta-reference | meta-reference | meta-reference; remote::pgvector; remote::chromadb | meta-reference | meta-reference |
+| Meta Reference Quantized | [llamastack/distribution-meta-reference-quantized-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-quantized-gpu/general) | [Guide](./meta-reference-quantized-gpu/) | meta-reference-quantized | meta-reference | meta-reference; remote::pgvector; remote::chromadb | meta-reference | meta-reference |
 | Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](./ollama/) | remote::ollama | meta-reference | remote::pgvector; remote::chromadb | remote::ollama | meta-reference |
 | TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | [Guide](./tgi/) | remote::tgi | meta-reference | meta-reference; remote::pgvector; remote::chromadb | meta-reference | meta-reference |
 | Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](./together/) | remote::together | meta-reference | remote::weaviate | meta-reference | meta-reference |
diff --git a/distributions/meta-reference-quantized-gpu/README.md b/distributions/meta-reference-quantized-gpu/README.md
new file mode 100644
index 000000000..0c05a13c1
--- /dev/null
+++ b/distributions/meta-reference-quantized-gpu/README.md
@@ -0,0 +1,34 @@
+# Meta Reference Quantized Distribution
+
+The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations.
+
+
+| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** |
+|----------------- |------------------------ |---------------- |-------------------------------------------------- |---------------- |---------------- |
+| **Provider(s)** | meta-reference-quantized | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference |
+
+The only difference vs. the `meta-reference-gpu` distribution is that it supports more efficient inference -- with fp8, int4 quantization, etc.
+
+### Start the Distribution (Single Node GPU)
+
+> [!NOTE]
+> This assumes you have access to a GPU to run a local server.
+
+
+> [!NOTE]
+> `~/.llama` should be the path containing downloaded weights of Llama models.
+
+
+To download and run a pre-built docker container, you may use the following command:
+
+```
+docker run -it -p 5000:5000 -v ~/.llama:/root/.llama \
+  -v ./run.yaml:/root/my-run.yaml \
+  --gpus=all \
+  distribution-meta-reference-quantized-gpu \
+  --yaml_config /root/my-run.yaml
+```
+
+### Alternative (Build and start distribution locally via conda)
+
+- You may check out the [Getting Started](../../docs/getting_started.md) guide for more details on building locally via conda and starting up the distribution.
diff --git a/distributions/meta-reference-quantized-gpu/build.yaml b/distributions/meta-reference-quantized-gpu/build.yaml
new file mode 100644
index 000000000..e9ddb4aad
--- /dev/null
+++ b/distributions/meta-reference-quantized-gpu/build.yaml
@@ -0,0 +1,14 @@
+name: meta-reference-quantized-gpu
+distribution_spec:
+  docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
+  description: Use code from `llama_stack` itself to serve all llama stack APIs
+  providers:
+    inference: meta-reference-quantized
+    memory:
+      - meta-reference
+      - remote::chromadb
+      - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
+image_type: docker
diff --git a/distributions/meta-reference-quantized-gpu/run.yaml b/distributions/meta-reference-quantized-gpu/run.yaml
new file mode 100644
index 000000000..6e8be2b6d
--- /dev/null
+++ b/distributions/meta-reference-quantized-gpu/run.yaml
@@ -0,0 +1,51 @@
+version: '2'
+built_at: '2024-10-08T17:40:45.325529'
+image_name: local
+docker_image: null
+conda_env: local
+apis:
+- shields
+- agents
+- models
+- memory
+- memory_banks
+- inference
+- safety
+providers:
+  inference:
+  - provider_id: meta0
+    provider_type: meta-reference-quantized
+    config:
+      model: Llama3.2-3B-Instruct
+      quantization:
+        type: fp8
+      torch_seed: null
+      max_seq_len: 2048
+      max_batch_size: 1
+  safety:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      llama_guard_shield:
+        model: Llama-Guard-3-1B
+        excluded_categories: []
+        disable_input_check: false
+        disable_output_check: false
+      prompt_guard_shield:
+        model: Prompt-Guard-86M
+  memory:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config: {}
+  agents:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      persistence_store:
+        namespace: null
+        type: sqlite
+        db_path: ~/.llama/runtime/kvstore.db
+  telemetry:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config: {}
diff --git a/llama_stack/distribution/build_container.sh b/llama_stack/distribution/build_container.sh
index 19f3df1e3..3bf74edcf 100755
--- a/llama_stack/distribution/build_container.sh
+++ b/llama_stack/distribution/build_container.sh
@@ -97,7 +97,7 @@ if [ -n "$pip_dependencies" ]; then
 fi

 if [ -n "$special_pip_deps" ]; then
-  IFS='#' read -ra parts <<< "$special_pip_deps"
+  IFS='#' read -ra parts <<<"$special_pip_deps"
   for part in "${parts[@]}"; do
     add_to_docker "RUN pip install $part"
   done
@@ -127,7 +127,7 @@ if [ -n "$LLAMA_MODELS_DIR" ]; then
   mounts="$mounts -v $(readlink -f $LLAMA_MODELS_DIR):$models_mount"
 fi

-if command -v selinuxenabled &> /dev/null && selinuxenabled; then
+if command -v selinuxenabled &>/dev/null && selinuxenabled; then
   # Disable SELinux labels -- we don't want to relabel the llama-stack source dir
   DOCKER_OPTS="$DOCKER_OPTS --security-opt label=disable"
 fi
@@ -139,4 +139,4 @@ $DOCKER_BINARY build $DOCKER_OPTS -t $image_name -f "$TEMP_DIR/Dockerfile" "$REP
 rm -rf $REPO_CONFIGS_DIR

 set +x
-echo "Success! You can run it with: $DOCKER_BINARY $DOCKER_OPTS run -p 5000:5000 $image_name"
+echo "Success!"
diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py
index 6f8bc2c6e..28555755b 100644
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@@ -36,7 +36,7 @@ def available_providers() -> List[ProviderSpec]:
             pip_packages=(
                 META_REFERENCE_DEPS
                 + [
-                    "fbgemm-gpu==0.8.0",
+                    "fbgemm-gpu",
                 ]
             ),
             module="llama_stack.providers.impls.meta_reference.inference",
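For anyone who wants to sanity-check a customized `run.yaml` before mounting it into the container (for example, after switching `type: fp8` to another quantization scheme), here is a minimal, illustrative sketch in Python. It does not use the actual `llama_stack` config classes; the field names simply mirror the `inference` block of the `run.yaml` added above, and the accepted quantization types (`fp8`, `int4`) are assumed from the README text in this diff.

```
# Illustrative sketch only -- not the llama_stack implementation.
# It mirrors the `inference` provider block of the run.yaml added in this
# diff, so a hand-edited config can be validated before `docker run`.
from typing import Literal, Optional

import yaml  # pip install pyyaml
from pydantic import BaseModel


class QuantizationConfig(BaseModel):
    # fp8 and int4 are the schemes mentioned in the README; others may exist.
    type: Literal["fp8", "int4"]


class QuantizedInferenceConfig(BaseModel):
    model: str
    quantization: Optional[QuantizationConfig] = None
    torch_seed: Optional[int] = None
    max_seq_len: int = 2048
    max_batch_size: int = 1


if __name__ == "__main__":
    with open("run.yaml") as f:
        run = yaml.safe_load(f)
    cfg = run["providers"]["inference"][0]["config"]
    print(QuantizedInferenceConfig(**cfg))
```

If the file parses and validates, it can then be mounted as `/root/my-run.yaml` in the `docker run` command shown in the new README.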