diff --git a/distributions/tgi/gpu/compose.yaml b/distributions/tgi/compose.yaml
similarity index 100%
rename from distributions/tgi/gpu/compose.yaml
rename to distributions/tgi/compose.yaml
diff --git a/distributions/tgi/cpu/compose.yaml b/distributions/tgi/cpu/compose.yaml
deleted file mode 100644
index 3ff6345e2..000000000
--- a/distributions/tgi/cpu/compose.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-services:
-  text-generation-inference:
-    image: ghcr.io/huggingface/text-generation-inference:latest
-    network_mode: "host"
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    ports:
-      - "5009:5009"
-    command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
-    runtime: nvidia
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://text-generation-inference:5009/health"]
-      interval: 5s
-      timeout: 5s
-      retries: 30
-  llamastack:
-    depends_on:
-      text-generation-inference:
-        condition: service_healthy
-    image: llamastack/llamastack-tgi
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      # Link to run.yaml file
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "5000:5000"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    restart_policy:
-      condition: on-failure
-      delay: 3s
-      max_attempts: 5
-      window: 60s
diff --git a/distributions/tgi/cpu/run.yaml b/distributions/tgi/cpu/run.yaml
deleted file mode 100644
index bf46391b4..000000000
--- a/distributions/tgi/cpu/run.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-version: '2'
-built_at: '2024-10-08T17:40:45.325529'
-image_name: local
-docker_image: null
-conda_env: local
-apis:
-- shields
-- agents
-- models
-- memory
-- memory_banks
-- inference
-- safety
-providers:
-  inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url:
-  safety:
-  - provider_id: meta0
-    provider_type: meta-reference
-    config:
-      llama_guard_shield:
-        model: Llama-Guard-3-1B
-        excluded_categories: []
-        disable_input_check: false
-        disable_output_check: false
-      prompt_guard_shield:
-        model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: meta-reference
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: meta-reference
-    config: {}
diff --git a/distributions/tgi/gpu/run.yaml b/distributions/tgi/run.yaml
similarity index 100%
rename from distributions/tgi/gpu/run.yaml
rename to distributions/tgi/run.yaml
diff --git a/docs/source/getting_started/distributions/self_hosted_distro/tgi.md b/docs/source/getting_started/distributions/self_hosted_distro/tgi.md
index 3ee079360..8ad9de181 100644
--- a/docs/source/getting_started/distributions/self_hosted_distro/tgi.md
+++ b/docs/source/getting_started/distributions/self_hosted_distro/tgi.md
@@ -15,7 +15,7 @@ The `llamastack/distribution-tgi` distribution consists of the following provide
 
 ```
-$ cd distributions/tgi/gpu && docker compose up
+$ cd distributions/tgi && docker compose up
 ```
 
 The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference.
 You should be able to see the following outputs --
@@ -34,23 +34,6 @@ To kill the server
 docker compose down
 ```
 
-### Docker: Start the Distribution (Single Node CPU)
-
-> [!NOTE]
-> This assumes you have an hosted endpoint compatible with TGI server.
-
-```
-$ cd distributions/tgi/cpu && docker compose up
-```
-
-Replace in `run.yaml` file with your TGI endpoint.
-```
-inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url:
-```
 
 ### Conda: TGI server + llama stack run
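
Note: with the CPU variant removed, pointing the consolidated distribution at an already-hosted TGI endpoint would still be done by editing the `remote::tgi` provider in `distributions/tgi/run.yaml`. A minimal sketch of that provider block, assuming the retained `run.yaml` keeps the same shape as the deleted CPU file; the URL below is a hypothetical placeholder chosen to match the port TGI is started on in `compose.yaml`:

```yaml
# Sketch only: inference provider block in distributions/tgi/run.yaml.
# Assumes the retained run.yaml mirrors the structure of the deleted CPU variant.
# http://127.0.0.1:5009 is a placeholder; the compose file runs TGI on port 5009.
providers:
  inference:
  - provider_id: tgi0
    provider_type: remote::tgi
    config:
      url: http://127.0.0.1:5009
```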