From b281ae343ac92370375c5027355372cef1318b2a Mon Sep 17 00:00:00 2001
From: Hardik Shah
Date: Thu, 6 Feb 2025 11:45:54 -0800
Subject: [PATCH] point to DEH for inference

---
 .../distributions/self_hosted_distro/dell.md | 69 +++++++++----------
 llama_stack/templates/dell/doc_template.md   | 69 +++++++++----------
 2 files changed, 68 insertions(+), 70 deletions(-)

diff --git a/docs/source/distributions/self_hosted_distro/dell.md b/docs/source/distributions/self_hosted_distro/dell.md
index 8a90bdc87..a6a9af560 100644
--- a/docs/source/distributions/self_hosted_distro/dell.md
+++ b/docs/source/distributions/self_hosted_distro/dell.md
@@ -39,9 +39,9 @@ The following environment variables can be configured:
 - `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`)
 
 
-## Setting up TGI server
+## Setting up Inference server using Dell Enterprise Hub's custom TGI container
 
-Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker:
+You can run inference using Dell Enterprise Hub's (DEH) custom TGI container. Here is a sample script to start the DEH container locally via Docker:
 
 ```bash
 export INFERENCE_PORT=8181
@@ -53,19 +53,18 @@ export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT
 export CUDA_VISIBLE_DEVICES=0
 export LLAMA_STACK_PORT=8321
 
-docker run --rm -it \
---network host \
--v $HOME/.cache/huggingface:/data \
--e HF_TOKEN=$HF_TOKEN \
--p $INFERENCE_PORT:$INFERENCE_PORT \
---gpus $CUDA_VISIBLE_DEVICES \
-ghcr.io/huggingface/text-generation-inference \
---dtype bfloat16 \
---usage-stats off \
---sharded false \
---cuda-memory-fraction 0.7 \
---model-id $INFERENCE_MODEL \
---port $INFERENCE_PORT --hostname 0.0.0.0
+docker run \
+  -it \
+  --network host \
+  --shm-size 1g \
+  -p $INFERENCE_PORT:$INFERENCE_PORT \
+  --gpus $CUDA_VISIBLE_DEVICES \
+  -e NUM_SHARD=1 \
+  -e MAX_BATCH_PREFILL_TOKENS=32768 \
+  -e MAX_INPUT_TOKENS=8000 \
+  -e MAX_TOTAL_TOKENS=8192 \
+  -e RUST_BACKTRACE=full \
+  registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
 ```
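+
+Once the container is up, you can optionally sanity-check it before wiring it into Llama Stack. This is only a quick sketch; it assumes the DEH image exposes TGI's standard `/generate` route on `$INFERENCE_PORT`:
+
+```bash
+# Optional: verify the DEH/TGI inference endpoint responds (assumes TGI's /generate API)
+curl http://localhost:$INFERENCE_PORT/generate \
+  -X POST \
+  -H 'Content-Type: application/json' \
+  -d '{"inputs": "Hello,", "parameters": {"max_new_tokens": 16}}'
+```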
 
 If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
@@ -92,21 +91,21 @@ ghcr.io/huggingface/text-generation-inference \
 --port $SAFETY_INFERENCE_PORT
 ```
 
-## Dell distribution relies on ChromDB for vector database usage
+## Dell distribution relies on ChromaDB for vector database usage
 
-You can start a chrom-db easily using docker.
+You can start a chroma-db easily using docker.
 
 ```bash
 # This is where the indices are persisted
 mkdir -p chromadb
 podman run --rm -it \
---network host \
---name chromadb \
--v ./chromadb:/chroma/chroma \
--e IS_PERSISTENT=TRUE \
-chromadb/chroma:latest \
---port $CHROMADB_PORT \
---host $CHROMADB_HOST
+  --network host \
+  --name chromadb \
+  -v ./chromadb:/chroma/chroma \
+  -e IS_PERSISTENT=TRUE \
+  chromadb/chroma:latest \
+  --port $CHROMADB_PORT \
+  --host $CHROMADB_HOST
 ```
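+
+To confirm ChromaDB is reachable before starting the stack, you can hit its heartbeat route. A minimal check, assuming the server listens on `$CHROMADB_HOST:$CHROMADB_PORT` as configured above and exposes Chroma's standard `/api/v1/heartbeat` endpoint:
+
+```bash
+# Optional: returns a timestamp payload if ChromaDB is up
+curl http://$CHROMADB_HOST:$CHROMADB_PORT/api/v1/heartbeat
+```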
 
 ## Running Llama Stack
 
@@ -119,17 +118,17 @@ This method allows you to get started quickly without having to build the distri
 
 ```bash
 docker run -it \
---network host \
--p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
--v ~/.llama:/root/.llama \
-# NOTE: mount the llama-stack / llama-model directories if testing local changes else not needed
--v /home/hjshah/git/llama-stack:/app/llama-stack-source -v /home/hjshah/git/llama-models:/app/llama-models-source \
-# localhost/distribution-dell:dev if building / testing locally
-llamastack/distribution-dell\
---port $LLAMA_STACK_PORT \
---env INFERENCE_MODEL=$INFERENCE_MODEL \
---env DEH_URL=$DEH_URL \
---env CHROMA_URL=$CHROMA_URL
+  --network host \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  # NOTE: mount the llama-stack / llama-model directories if testing local changes else not needed
+  -v /home/hjshah/git/llama-stack:/app/llama-stack-source -v /home/hjshah/git/llama-models:/app/llama-models-source \
+  # localhost/distribution-dell:dev if building / testing locally
+  llamastack/distribution-dell \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env DEH_URL=$DEH_URL \
+  --env CHROMA_URL=$CHROMA_URL
 ```
 
diff --git a/llama_stack/templates/dell/doc_template.md b/llama_stack/templates/dell/doc_template.md
index d7519328a..d8139ddc9 100644
--- a/llama_stack/templates/dell/doc_template.md
+++ b/llama_stack/templates/dell/doc_template.md
@@ -28,9 +28,9 @@ The following environment variables can be configured:
 {% endif %}
 
 
-## Setting up TGI server
+## Setting up Inference server using Dell Enterprise Hub's custom TGI container
 
-Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker:
+You can run inference using Dell Enterprise Hub's (DEH) custom TGI container. Here is a sample script to start the DEH container locally via Docker:
 
 ```bash
 export INFERENCE_PORT=8181
@@ -42,19 +42,18 @@ export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT
 export CUDA_VISIBLE_DEVICES=0
 export LLAMA_STACK_PORT=8321
 
-docker run --rm -it \
---network host \
--v $HOME/.cache/huggingface:/data \
--e HF_TOKEN=$HF_TOKEN \
--p $INFERENCE_PORT:$INFERENCE_PORT \
---gpus $CUDA_VISIBLE_DEVICES \
-ghcr.io/huggingface/text-generation-inference \
---dtype bfloat16 \
---usage-stats off \
---sharded false \
---cuda-memory-fraction 0.7 \
---model-id $INFERENCE_MODEL \
---port $INFERENCE_PORT --hostname 0.0.0.0
+docker run \
+  -it \
+  --network host \
+  --shm-size 1g \
+  -p $INFERENCE_PORT:$INFERENCE_PORT \
+  --gpus $CUDA_VISIBLE_DEVICES \
+  -e NUM_SHARD=1 \
+  -e MAX_BATCH_PREFILL_TOKENS=32768 \
+  -e MAX_INPUT_TOKENS=8000 \
+  -e MAX_TOTAL_TOKENS=8192 \
+  -e RUST_BACKTRACE=full \
+  registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
 ```
 
 If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
@@ -81,20 +80,20 @@ ghcr.io/huggingface/text-generation-inference \
 --port $SAFETY_INFERENCE_PORT
 ```
 
-## Dell distribution relies on ChromDB for vector database usage
+## Dell distribution relies on ChromaDB for vector database usage
 
-You can start a chrom-db easily using docker.
+You can start a chroma-db easily using docker.
 
 ```bash
 # This is where the indices are persisted
 mkdir -p chromadb
 podman run --rm -it \
---network host \
---name chromadb \
--v ./chromadb:/chroma/chroma \
--e IS_PERSISTENT=TRUE \
-chromadb/chroma:latest \
---port $CHROMADB_PORT \
---host $CHROMADB_HOST
+  --network host \
+  --name chromadb \
+  -v ./chromadb:/chroma/chroma \
+  -e IS_PERSISTENT=TRUE \
+  chromadb/chroma:latest \
+  --port $CHROMADB_PORT \
+  --host $CHROMADB_HOST
 ```
 ## Running Llama Stack
 
@@ -108,17 +107,17 @@ This method allows you to get started quickly without having to build the distri
 
 ```bash
 docker run -it \
---network host \
--p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
--v ~/.llama:/root/.llama \
-# NOTE: mount the llama-stack / llama-model directories if testing local changes else not needed
--v /home/hjshah/git/llama-stack:/app/llama-stack-source -v /home/hjshah/git/llama-models:/app/llama-models-source \
-# localhost/distribution-dell:dev if building / testing locally
-llamastack/distribution-{{ name }}\
---port $LLAMA_STACK_PORT \
---env INFERENCE_MODEL=$INFERENCE_MODEL \
---env DEH_URL=$DEH_URL \
---env CHROMA_URL=$CHROMA_URL
+  --network host \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  # NOTE: mount the llama-stack / llama-model directories if testing local changes else not needed
+  -v /home/hjshah/git/llama-stack:/app/llama-stack-source -v /home/hjshah/git/llama-models:/app/llama-models-source \
+  # localhost/distribution-dell:dev if building / testing locally
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT \
+  --env INFERENCE_MODEL=$INFERENCE_MODEL \
+  --env DEH_URL=$DEH_URL \
+  --env CHROMA_URL=$CHROMA_URL
 ```
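+
+As a quick end-to-end check, you can point the `llama-stack-client` CLI at the running server and list the registered models. This is a sketch that assumes the client is installed separately (`pip install llama-stack-client`) and that the stack is listening on `$LLAMA_STACK_PORT`:
+
+```bash
+# Optional: talk to the stack from the host to confirm it is serving
+pip install llama-stack-client
+llama-stack-client configure --endpoint http://localhost:$LLAMA_STACK_PORT
+llama-stack-client models list
+```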