kill tgi/cpu

2024-11-08 10:33:45 -08:00 · 2024-11-08 10:33:45 -08:00 · b1d7376730
commit b1d7376730
parent 6192bf43a4
5 changed files with 1 additions and 97 deletions
--- a/distributions/tgi/gpu/compose.yaml
+++ b/distributions/tgi/gpu/compose.yaml
--- a/distributions/tgi/cpu/compose.yaml
+++ b/distributions/tgi/cpu/compose.yaml
@ -1,33 +0,0 @@
 services:
  text-generation-inference:
    image: ghcr.io/huggingface/text-generation-inference:latest
    network_mode: "host"
    volumes:
      - $HOME/.cache/huggingface:/data
    ports:
      - "5009:5009"
    command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
    runtime: nvidia
    healthcheck:
      test: ["CMD", "curl", "-f", "http://text-generation-inference:5009/health"]
      interval: 5s
      timeout: 5s
      retries: 30
  llamastack:
    depends_on:
      text-generation-inference:
        condition: service_healthy
    image: llamastack/llamastack-tgi
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to run.yaml file
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "5000:5000"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    restart_policy:
      condition: on-failure
      delay: 3s
      max_attempts: 5
      window: 60s
--- a/distributions/tgi/cpu/run.yaml
+++ b/distributions/tgi/cpu/run.yaml
@ -1,46 +0,0 @@
 version: '2'
 built_at: '2024-10-08T17:40:45.325529'
 image_name: local
 docker_image: null
 conda_env: local
 apis:
 - shields
 - agents
 - models
 - memory
 - memory_banks
 - inference
 - safety
 providers:
  inference:
  - provider_id: tgi0
    provider_type: remote::tgi
    config:
      url: <ENTER_YOUR_TGI_HOSTED_ENDPOINT>
  safety:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      llama_guard_shield:
        model: Llama-Guard-3-1B
        excluded_categories: []
        disable_input_check: false
        disable_output_check: false
      prompt_guard_shield:
        model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}
  agents:
  - provider_id: meta0
    provider_type: meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: meta-reference
    config: {}
--- a/distributions/tgi/gpu/run.yaml
+++ b/distributions/tgi/gpu/run.yaml
--- a/docs/source/getting_started/distributions/self_hosted_distro/tgi.md
+++ b/docs/source/getting_started/distributions/self_hosted_distro/tgi.md
@ -15,7 +15,7 @@ The `llamastack/distribution-tgi` distribution consists of the following provide
 ```
-$ cd distributions/tgi/gpu && docker compose up
+$ cd distributions/tgi && docker compose up
 ```
 The script first starts the TGI server, then starts the Llama Stack distribution server, which connects to the remote TGI provider for inference. You should see output like the following:
@ -34,23 +34,6 @@ To kill the server
 docker compose down
 ```
 ### Docker: Start the Distribution (Single Node CPU)
 > [!NOTE]
 > This assumes you have a hosted endpoint compatible with the TGI server.
 ```
 $ cd distributions/tgi/cpu && docker compose up
 ```
 Replace `<ENTER_YOUR_TGI_HOSTED_ENDPOINT>` in the `run.yaml` file with your TGI endpoint.
 ```
 inference:
  - provider_id: tgi0
    provider_type: remote::tgi
    config:
      url: <ENTER_YOUR_TGI_HOSTED_ENDPOINT>
 ```
 ### Conda: TGI server + llama stack run