diff --git a/scripts/docker/tgi/compose.yaml b/scripts/docker/tgi/compose.yaml
new file mode 100644
index 000000000..d0a1f8c04
--- /dev/null
+++ b/scripts/docker/tgi/compose.yaml
@@ -0,0 +1,55 @@
+services:
+  text-generation-inference:
+    image: ghcr.io/huggingface/text-generation-inference:latest
+    network_mode: "host"
+    volumes:
+      - $HOME/.cache/huggingface:/data
+    ports:
+      - "5009:5009"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=0
+      - HF_HOME=/data
+      - HF_DATASETS_CACHE=/data
+      - HF_MODULES_CACHE=/data
+      - HF_HUB_CACHE=/data
+    command: ["--dtype", "bfloat16", "--usage-stats", "on", "--sharded", "false", "--model-id", "meta-llama/Llama-3.1-8B-Instruct", "--port", "5009", "--cuda-memory-fraction", "0.3"]
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:5009/health"]
+      interval: 5s
+      timeout: 5s
+      retries: 10
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              # that's the closest analogue to --gpus; provide
+              # an integer amount of devices or 'all'
+              count: 1
+              # Devices are reserved using a list of capabilities, making
+              # capabilities the only required field. A device MUST
+              # satisfy all the requested capabilities for a successful
+              # reservation.
+              capabilities: [gpu]
+    runtime: nvidia
+  llamastack-local-cpu:
+    depends_on:
+      text-generation-inference:
+        condition: service_healthy
+    restart: on-failure
+    image: llamastack-local-cpu
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      # Link to TGI run.yaml file
+      - ./tgi-run.yaml:/root/llamastack-run-tgi.yaml
+    ports:
+      - "5000:5000"
+    command: ["--yaml_config", "/root/llamastack-run-tgi.yaml"]
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:5009/health"]
+      interval: 5s
+      timeout: 5s
+      retries: 10
diff --git a/scripts/docker/tgi/tgi-run.yaml b/scripts/docker/tgi/tgi-run.yaml
new file mode 100644
index 000000000..e75f12db7
--- /dev/null
+++ b/scripts/docker/tgi/tgi-run.yaml
@@ -0,0 +1,59 @@
+version: '2'
+built_at: '2024-10-08T17:40:45.325529'
+image_name: local
+docker_image: null
+conda_env: local
+apis:
+- shields
+- agents
+- models
+- memory
+- memory_banks
+- inference
+- safety
+# - evals
+providers:
+  # evals:
+  #   - provider_id: eleuther
+  #     provider_type: eleuther
+  #     config: {}
+  inference:
+    - provider_id: remote::tgi
+      provider_type: remote::tgi
+      config:
+        url: http://127.0.0.1:5009
+    # - provider_id: meta-reference
+    #   provider_type: meta-reference
+    #   config:
+    #     model: Llama3.1-8B-Instruct
+    #     quantization: null
+    #     torch_seed: null
+    #     max_seq_len: 4096
+    #     max_batch_size: 1
+  safety:
+    - provider_id: meta-reference
+      provider_type: meta-reference
+      config:
+        llama_guard_shield:
+          model: Llama-Guard-3-1B
+          excluded_categories: []
+          disable_input_check: false
+          disable_output_check: false
+        prompt_guard_shield:
+          model: Prompt-Guard-86M
+  memory:
+    - provider_id: meta-reference
+      provider_type: meta-reference
+      config: {}
+  agents:
+    - provider_id: meta-reference
+      provider_type: meta-reference
+      config:
+        persistence_store:
+          namespace: null
+          type: sqlite
+          db_path: /root/.llama/runtime/kvstore.db
+  telemetry:
+    - provider_id: meta-reference
+      provider_type: meta-reference
+      config: {}
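
Once docker compose up brings both services to a healthy state, the TGI container can be smoke-tested directly on port 5009. The sketch below is not part of the change above: it assumes only what compose.yaml configures (TGI listening on 5009 with a /health route) plus TGI's standard /generate REST endpoint; the prompt and generation parameters are illustrative.

# Minimal smoke test for the text-generation-inference service
# defined in compose.yaml. Assumes the stack is already running
# (docker compose up) and TGI listens on port 5009 as configured.
import time

import requests

TGI_URL = "http://localhost:5009"


def wait_for_health(url: str, retries: int = 10, interval: float = 5.0) -> None:
    # Mirrors the compose healthcheck: curl -f http://localhost:5009/health
    for _ in range(retries):
        try:
            if requests.get(f"{url}/health", timeout=5).ok:
                return
        except requests.ConnectionError:
            pass
        time.sleep(interval)
    raise RuntimeError(f"{url}/health never became ready")


def generate(prompt: str) -> str:
    # TGI's non-streaming /generate route; parameters are illustrative.
    resp = requests.post(
        f"{TGI_URL}/generate",
        json={"inputs": prompt, "parameters": {"max_new_tokens": 64}},
        timeout=60,
    )
    resp.raise_for_status()
    return resp.json()["generated_text"]


if __name__ == "__main__":
    wait_for_health(TGI_URL)
    print(generate("What is the capital of France?"))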
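
The llamastack-local-cpu service then fronts TGI through the remote::tgi provider declared in tgi-run.yaml, serving on port 5000. A client-side sketch follows, assuming the llama-stack-client Python package; the method names and the model identifier are taken from that package's early examples, may differ across versions, and are assumptions rather than anything this diff guarantees.

# Hypothetical client call against the Llama Stack server on port 5000.
# Assumes the llama-stack-client package (pip install llama-stack-client);
# names follow its early examples and may vary by version.
from llama_stack_client import LlamaStackClient
from llama_stack_client.types import UserMessage

client = LlamaStackClient(base_url="http://localhost:5000")

response = client.inference.chat_completion(
    # The model identifier must match what the stack registers for the
    # remote::tgi provider; this value is an assumption.
    model="Llama3.1-8B-Instruct",
    messages=[UserMessage(role="user", content="Hello, who are you?")],
)
print(response.completion_message.content)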