kill tgi/cpu

This commit is contained in:
Xi Yan 2024-11-08 10:33:45 -08:00
parent 6192bf43a4
commit b1d7376730
5 changed files with 1 additions and 97 deletions

View file

@ -1,33 +0,0 @@
# Docker Compose stack: a TGI (text-generation-inference) server plus a
# Llama Stack server that is started only once TGI reports healthy.
services:
  text-generation-inference:
    image: ghcr.io/huggingface/text-generation-inference:latest
    # Host networking: the container shares the host's network stack, so the
    # server's listen port (5009, set via --port below) is reachable on the
    # host directly. `ports:` must not be combined with host networking, so no
    # port mapping is declared.
    network_mode: "host"
    volumes:
      # Mount the host's Hugging Face cache at /data inside the container.
      - $HOME/.cache/huggingface:/data
    command:
      - "--dtype"
      - "bfloat16"
      - "--usage-stats"
      - "on"
      - "--sharded"
      - "false"
      - "--model-id"
      - "meta-llama/Llama-3.1-8B-Instruct"
      - "--port"
      - "5009"
      - "--cuda-memory-fraction"
      - "0.3"
    # Selects the NVIDIA container runtime, which must be installed on the host.
    runtime: nvidia
    healthcheck:
      # The probe runs inside the container. Under host networking the Compose
      # service name is not resolvable, so the server is probed via localhost.
      test: ["CMD", "curl", "-f", "http://localhost:5009/health"]
      interval: 5s
      timeout: 5s
      retries: 30
  llamastack:
    # Wait for the TGI healthcheck to pass before starting.
    depends_on:
      text-generation-inference:
        condition: service_healthy
    image: llamastack/llamastack-tgi
    # Host networking (no `ports:` mapping; see the note on the TGI service).
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to run.yaml file
      - ./run.yaml:/root/my-run.yaml
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    # `restart_policy` is only defined under `deploy:` in the Compose schema;
    # it is not a valid service-level key.
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

View file

@ -1,46 +0,0 @@
# Llama Stack run configuration: lists the APIs to serve and, for each API,
# the provider instance(s) backing it.
version: '2'
built_at: '2024-10-08T17:40:45.325529'
image_name: local
docker_image: null
conda_env: local

# APIs enabled for this distribution.
apis:
  - shields
  - agents
  - models
  - memory
  - memory_banks
  - inference
  - safety

# One list of provider instances per API.
providers:
  inference:
    # Remote TGI endpoint; replace the placeholder URL before running.
    - provider_id: tgi0
      provider_type: remote::tgi
      config:
        url: <ENTER_YOUR_TGI_HOSTED_ENDPOINT>

  safety:
    - provider_id: meta0
      provider_type: meta-reference
      config:
        llama_guard_shield:
          model: Llama-Guard-3-1B
          excluded_categories: []
          disable_input_check: false
          disable_output_check: false
        prompt_guard_shield:
          model: Prompt-Guard-86M

  memory:
    - provider_id: meta0
      provider_type: meta-reference
      config: {}

  agents:
    - provider_id: meta0
      provider_type: meta-reference
      config:
        # Agent state is persisted in a SQLite key-value store at db_path.
        persistence_store:
          namespace: null
          type: sqlite
          db_path: ~/.llama/runtime/kvstore.db

  telemetry:
    - provider_id: meta0
      provider_type: meta-reference
      config: {}

View file

@ -15,7 +15,7 @@ The `llamastack/distribution-tgi` distribution consists of the following provide
``` ```
$ cd distributions/tgi/gpu && docker compose up $ cd distributions/tgi && docker compose up
``` ```
The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference. You should be able to see the following outputs -- The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference. You should be able to see the following outputs --
@ -34,23 +34,6 @@ To kill the server
docker compose down docker compose down
``` ```
### Docker: Start the Distribution (Single Node CPU)
> [!NOTE]
> This assumes you have an hosted endpoint compatible with TGI server.
```
$ cd distributions/tgi/cpu && docker compose up
```
Replace <ENTER_YOUR_TGI_HOSTED_ENDPOINT> in `run.yaml` file with your TGI endpoint.
```
inference:
- provider_id: tgi0
provider_type: remote::tgi
config:
url: <ENTER_YOUR_TGI_HOSTED_ENDPOINT>
```
### Conda: TGI server + llama stack run ### Conda: TGI server + llama stack run