chore: remove distributions folder (#1801)
# What does this PR do?

- The distributions folder references the templates and contains dead docker compose scripts.

## Test Plan
parent f8445b0d69
commit 742020b94a

45 changed files with 0 additions and 742 deletions
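The folder being removed held little beyond symlinks back into `llama_stack/templates/` and stale compose files that duplicated the templates' entrypoints. A minimal sketch of the symlink pattern, assuming a `distributions/bedrock/` layout (inferred from the PR title and the relative target shown in the first deleted entry below):

```sh
# Hypothetical check; the target path is taken verbatim from the diff below.
readlink distributions/bedrock/build.yaml
# expected output: ../../llama_stack/templates/bedrock/build.yaml
```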
@@ -1 +0,0 @@
../../llama_stack/templates/bedrock/build.yaml

@@ -1,15 +0,0 @@
services:
  llamastack:
    image: distribution-bedrock
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-bedrock.yaml
    ports:
      - "8321:8321"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

@@ -1 +0,0 @@
../../llama_stack/templates/bedrock/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/cerebras/build.yaml

@@ -1,16 +0,0 @@
services:
  llamastack:
    image: llamastack/distribution-cerebras
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-cerebras.yaml
    ports:
      - "8321:8321"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

@@ -1 +0,0 @@
../../llama_stack/templates/cerebras/run.yaml

@@ -1,50 +0,0 @@
services:
  text-generation-inference:
    image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
    network_mode: "host"
    volumes:
      - $HOME/.cache/huggingface:/data
    ports:
      - "5009:5009"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
      - NUM_SHARD=4
      - MAX_BATCH_PREFILL_TOKENS=32768
      - MAX_INPUT_TOKENS=8000
      - MAX_TOTAL_TOKENS=8192
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: all
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
  llamastack:
    depends_on:
      text-generation-inference:
        condition: service_healthy
    image: llamastack/distribution-tgi
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to TGI run.yaml file
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "8321:8321"
    # Hack: wait for TGI server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    restart_policy:
      condition: on-failure
      delay: 3s
      max_attempts: 5
      window: 60s

@@ -1,44 +0,0 @@
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: tgi0
    provider_type: remote::tgi
    config:
      url: http://127.0.0.1:80
  safety:
  - provider_id: meta0
    provider_type: inline::llama-guard
    config:
      model: Llama-Guard-3-1B
      excluded_categories: []
  - provider_id: meta1
    provider_type: inline::prompt-guard
    config:
      model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: inline::faiss
    config: {}
  agents:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}

@@ -1 +0,0 @@
../../llama_stack/templates/fireworks/build.yaml

@@ -1,14 +0,0 @@
services:
  llamastack:
    image: llamastack/distribution-fireworks
    ports:
      - "8321:8321"
    environment:
      - FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template fireworks"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

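Note that `FIREWORKS_API_KEY=${FIREWORKS_API_KEY}` above only forwards a value that must already be set in the host shell when compose runs. A hypothetical launch of this (now deleted) file, assuming it is saved under a name compose picks up by default:

```sh
export FIREWORKS_API_KEY="<your-key>"   # placeholder, not a real key
docker compose up llamastack
```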
@@ -1 +0,0 @@
../../llama_stack/templates/fireworks/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/meta-reference-gpu/build.yaml

@@ -1,34 +0,0 @@
services:
  llamastack:
    image: llamastack/distribution-meta-reference-gpu
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
    runtime: nvidia
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"

@@ -1 +0,0 @@
../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/meta-reference-gpu/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml

@@ -1,35 +0,0 @@
services:
  llamastack:
    image: llamastack/distribution-meta-reference-quantized-gpu
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

@@ -1,58 +0,0 @@
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: meta0
    provider_type: inline::meta-reference-quantized
    config:
      model: Llama3.2-3B-Instruct:int4-qlora-eo8
      quantization:
        type: int4
      torch_seed: null
      max_seq_len: 2048
      max_batch_size: 1
  - provider_id: meta1
    provider_type: inline::meta-reference-quantized
    config:
      # not a quantized model !
      model: Llama-Guard-3-1B
      quantization: null
      torch_seed: null
      max_seq_len: 2048
      max_batch_size: 1
  safety:
  - provider_id: meta0
    provider_type: inline::llama-guard
    config:
      model: Llama-Guard-3-1B
      excluded_categories: []
  - provider_id: meta1
    provider_type: inline::prompt-guard
    config:
      model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
  agents:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/kvstore.db
  telemetry:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}

@@ -1 +0,0 @@
../../llama_stack/templates/ollama/build.yaml

@@ -1,71 +0,0 @@
services:
  ollama:
    image: ollama/ollama:latest
    network_mode: ${NETWORK_MODE:-bridge}
    volumes:
      - ~/.ollama:/root/.ollama
    ports:
      - "11434:11434"
    environment:
      OLLAMA_DEBUG: 1
    command: []
    deploy:
      resources:
        limits:
          memory: 8G # Set maximum memory
        reservations:
          memory: 8G # Set minimum memory reservation
    # healthcheck:
    #   # ugh, no CURL in ollama image
    #   test: ["CMD", "curl", "-f", "http://ollama:11434"]
    #   interval: 10s
    #   timeout: 5s
    #   retries: 5

  ollama-init:
    image: ollama/ollama:latest
    depends_on:
      - ollama
      # condition: service_healthy
    network_mode: ${NETWORK_MODE:-bridge}
    environment:
      - OLLAMA_HOST=ollama
      - INFERENCE_MODEL=${INFERENCE_MODEL}
      - SAFETY_MODEL=${SAFETY_MODEL:-}
    volumes:
      - ~/.ollama:/root/.ollama
      - ./pull-models.sh:/pull-models.sh
    entrypoint: ["/pull-models.sh"]

  llamastack:
    depends_on:
      ollama:
        condition: service_started
      ollama-init:
        condition: service_started
    image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
    network_mode: ${NETWORK_MODE:-bridge}
    volumes:
      - ~/.llama:/root/.llama
      # Link to ollama run.yaml file
      - ~/local/llama-stack/:/app/llama-stack-source
      - ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
    ports:
      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
    environment:
      - INFERENCE_MODEL=${INFERENCE_MODEL}
      - SAFETY_MODEL=${SAFETY_MODEL:-}
      - OLLAMA_URL=http://ollama:11434
    entrypoint: >
      python -m llama_stack.distribution.server.server /root/my-run.yaml \
      --port ${LLAMA_STACK_PORT:-8321}
    deploy:
      restart_policy:
        condition: on-failure
        delay: 10s
        max_attempts: 3
        window: 60s
volumes:
  ollama:
  ollama-init:
  llamastack:

@@ -1,18 +0,0 @@
#!/bin/sh

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
  echo "Preloading $model..."
  if ! ollama run "$model"; then
    echo "Failed to pull and run $model"
    exit 1
  fi
done

echo "All models pulled successfully"

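In the compose file above, the `ollama-init` service supplies `INFERENCE_MODEL` and `SAFETY_MODEL` through its environment and points `OLLAMA_HOST` at the `ollama` service; run by hand, the script would be driven the same way. A sketch with illustrative model tags (not taken from the deleted files):

```sh
# Requires a reachable ollama server; `ollama run` pulls each model if it is missing.
INFERENCE_MODEL=llama3.2:3b SAFETY_MODEL=llama-guard3:1b ./pull-models.sh
```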
@@ -1 +0,0 @@
../../llama_stack/templates/ollama/run-with-safety.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/ollama/run.yaml

Binary file not shown.

@@ -1 +0,0 @@
../../llama_stack/templates/nvidia/build.yaml

@@ -1,19 +0,0 @@
services:
  llamastack:
    image: distribution-nvidia:dev
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-nvidia.yaml
    ports:
      - "8321:8321"
    environment:
      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

@@ -1 +0,0 @@
../../llama_stack/templates/nvidia/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/remote-vllm/build.yaml

@@ -1,99 +0,0 @@
services:
  vllm-inference:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_INFERENCE_PORT:-5100}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  # A little trick:
  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
  vllm-${VLLM_SAFETY_MODEL:+safety}:
    image: vllm/vllm-openai:latest
    volumes:
      - $HOME/.cache/huggingface:/root/.cache/huggingface
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
    command: >
      --gpu-memory-utilization 0.75
      --model ${VLLM_SAFETY_MODEL}
      --enforce-eager
      --max-model-len 8192
      --max-num-seqs 16
      --port ${VLLM_SAFETY_PORT:-5101}
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 5
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia
  llamastack:
    depends_on:
    - vllm-inference:
        condition: service_healthy
    - vllm-${VLLM_SAFETY_MODEL:+safety}:
        condition: service_healthy
    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
    volumes:
      - ~/.llama:/root/.llama
      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
    network_mode: ${NETWORK_MODE:-bridged}
    environment:
      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - MAX_TOKENS=${MAX_TOKENS:-4096}
      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
    ports:
      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
    # Hack: wait for vLLM server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 8321"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
volumes:
  vllm-inference:
  vllm-safety:
  llamastack:

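The `vllm-${VLLM_SAFETY_MODEL:+safety}` key above relies on docker compose applying the same shell-style `${VAR:+word}` expansion when it interpolates the file; per the comment in the deleted file, the unset case produces a trailing-hyphen entry that compose ignores. A quick shell sketch of how the name resolves (values are illustrative):

```sh
VLLM_SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
echo "vllm-${VLLM_SAFETY_MODEL:+safety}"   # -> vllm-safety
unset VLLM_SAFETY_MODEL
echo "vllm-${VLLM_SAFETY_MODEL:+safety}"   # -> vllm-
```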
@@ -1 +0,0 @@
../../llama_stack/templates/remote-vllm/run-with-safety.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/remote-vllm/run.yaml

@@ -1,9 +0,0 @@
name: runpod
distribution_spec:
  description: Use Runpod for running LLM inference
  providers:
    inference: remote::runpod
    memory: meta-reference
    safety: meta-reference
    agents: meta-reference
    telemetry: meta-reference

@@ -1 +0,0 @@
../../llama_stack/templates/sambanova/build.yaml

@@ -1,16 +0,0 @@
services:
  llamastack:
    image: llamastack/distribution-sambanova
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/llamastack-run-sambanova.yaml
    ports:
      - "5000:5000"
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-sambanova.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

@@ -1 +0,0 @@
../../llama_stack/templates/sambanova/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/tgi/build.yaml

@@ -1,103 +0,0 @@
services:
  tgi-inference:
    image: ghcr.io/huggingface/text-generation-inference:latest
    volumes:
      - $HOME/.cache/huggingface:/data
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
      - HF_TOKEN=$HF_TOKEN
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      --port ${TGI_INFERENCE_PORT:-8080}
      --cuda-memory-fraction 0.75
    healthcheck:
      test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  tgi-${TGI_SAFETY_MODEL:+safety}:
    image: ghcr.io/huggingface/text-generation-inference:latest
    volumes:
      - $HOME/.cache/huggingface:/data
    network_mode: ${NETWORK_MODE:-bridged}
    ports:
      - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
      - HF_TOKEN=$HF_TOKEN
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    command: >
      --dtype bfloat16
      --usage-stats off
      --sharded false
      --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
      --port ${TGI_SAFETY_PORT:-8081}
      --cuda-memory-fraction 0.75
    healthcheck:
      test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
      interval: 5s
      timeout: 5s
      retries: 30
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
    runtime: nvidia

  llamastack:
    depends_on:
      tgi-inference:
        condition: service_healthy
      tgi-${TGI_SAFETY_MODEL:+safety}:
        condition: service_healthy
    image: llamastack/distribution-tgi:test-0.0.52rc3
    network_mode: ${NETWORK_MODE:-bridged}
    volumes:
      - ~/.llama:/root/.llama
      - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
    ports:
      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
    # Hack: wait for TGI server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    restart_policy:
      condition: on-failure
      delay: 3s
      max_attempts: 5
      window: 60s
    environment:
      - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
      - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}

volumes:
  tgi-inference:
  tgi-safety:
  llamastack:

@@ -1 +0,0 @@
../../llama_stack/templates/tgi/run-with-safety.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/tgi/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/together/build.yaml

@@ -1,14 +0,0 @@
services:
  llamastack:
    image: llamastack/distribution-together
    ports:
      - "8321:8321"
    environment:
      - TOGETHER_API_KEY=${TOGETHER_API_KEY}
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template together"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

@@ -1 +0,0 @@
../../llama_stack/templates/together/run.yaml

@@ -1 +0,0 @@
../../llama_stack/templates/inline-vllm/build.yaml

@@ -1,35 +0,0 @@
services:
  llamastack:
    image: llamastack/distribution-inline-vllm
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      - ./run.yaml:/root/my-run.yaml
    ports:
      - "8321:8321"
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
    command: []
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
    deploy:
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s

@@ -1,66 +0,0 @@
version: '2'
image_name: local
container_image: null
conda_env: local
apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
providers:
  inference:
  - provider_id: vllm-inference
    provider_type: inline::vllm
    config:
      model: Llama3.2-3B-Instruct
      tensor_parallel_size: 1
      gpu_memory_utilization: 0.4
      enforce_eager: true
      max_tokens: 4096
  - provider_id: vllm-inference-safety
    provider_type: inline::vllm
    config:
      model: Llama-Guard-3-1B
      tensor_parallel_size: 1
      gpu_memory_utilization: 0.2
      enforce_eager: true
      max_tokens: 4096
  safety:
  - provider_id: meta0
    provider_type: inline::llama-guard
    config:
      model: Llama-Guard-3-1B
      excluded_categories: []
  # Uncomment to use prompt guard
  # - provider_id: meta1
  #   provider_type: inline::prompt-guard
  #   config:
  #     model: Prompt-Guard-86M
  memory:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}
  # Uncomment to use pgvector
  # - provider_id: pgvector
  #   provider_type: remote::pgvector
  #   config:
  #     host: 127.0.0.1
  #     port: 5432
  #     db: postgres
  #     user: postgres
  #     password: mysecretpassword
  agents:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config:
      persistence_store:
        namespace: null
        type: sqlite
        db_path: ~/.llama/runtime/agents_store.db
  telemetry:
  - provider_id: meta0
    provider_type: inline::meta-reference
    config: {}