chore: remove distributions folder (#1801)

# What does this PR do? - The distributions folder references the templates and contains dead Docker Compose scripts [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [//]: # (## Documentation)
2025-03-26 15:07:54 -07:00 · 2025-03-26 15:07:54 -07:00 · 742020b94a
commit 742020b94a
parent f8445b0d69
45 changed files with 0 additions and 742 deletions
--- a/distributions/bedrock/build.yaml
+++ b/distributions/bedrock/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/bedrock/build.yaml
--- a/distributions/bedrock/compose.yaml
+++ b/distributions/bedrock/compose.yaml
@ -1,15 +0,0 @@
-services:
-  llamastack:
-    image: distribution-bedrock
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-bedrock.yaml
-    ports:
-      - "8321:8321"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/bedrock/run.yaml
+++ b/distributions/bedrock/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/bedrock/run.yaml
--- a/distributions/cerebras/build.yaml
+++ b/distributions/cerebras/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/cerebras/build.yaml
--- a/distributions/cerebras/compose.yaml
+++ b/distributions/cerebras/compose.yaml
@ -1,16 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-cerebras
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-cerebras.yaml
-    ports:
-      - "8321:8321"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/cerebras/run.yaml
+++ b/distributions/cerebras/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/cerebras/run.yaml
--- a/distributions/dell-tgi/compose.yaml
+++ b/distributions/dell-tgi/compose.yaml
@ -1,50 +0,0 @@
-services:
-  text-generation-inference:
-    image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
-    network_mode: "host"
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    ports:
-      - "5009:5009"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
-      - NUM_SHARD=4
-      - MAX_BATCH_PREFILL_TOKENS=32768
-      - MAX_INPUT_TOKENS=8000
-      - MAX_TOTAL_TOKENS=8192
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: all
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
-            capabilities: [gpu]
-    runtime: nvidia
-  llamastack:
-    depends_on:
-      text-generation-inference:
-        condition: service_healthy
-    image: llamastack/distribution-tgi
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      # Link to TGI run.yaml file
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    # Hack: wait for TGI server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    restart_policy:
-      condition: on-failure
-      delay: 3s
-      max_attempts: 5
-      window: 60s
--- a/distributions/dell-tgi/run.yaml
+++ b/distributions/dell-tgi/run.yaml
@ -1,44 +0,0 @@
-version: '2'
-image_name: local
-container_image: null
-conda_env: local
-apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
-providers:
-  inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url: http://127.0.0.1:80
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::faiss
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
--- a/distributions/fireworks/build.yaml
+++ b/distributions/fireworks/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/fireworks/build.yaml
--- a/distributions/fireworks/compose.yaml
+++ b/distributions/fireworks/compose.yaml
@ -1,14 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-fireworks
-    ports:
-      - "8321:8321"
-    environment:
-      - FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template fireworks"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/fireworks/run.yaml
+++ b/distributions/fireworks/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/fireworks/run.yaml
--- a/distributions/meta-reference-gpu/build.yaml
+++ b/distributions/meta-reference-gpu/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-gpu/build.yaml
--- a/distributions/meta-reference-gpu/compose.yaml
+++ b/distributions/meta-reference-gpu/compose.yaml
@ -1,34 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-meta-reference-gpu
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: 1
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
-            capabilities: [gpu]
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
-    runtime: nvidia
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
--- a/distributions/meta-reference-gpu/run-with-safety.yaml
+++ b/distributions/meta-reference-gpu/run-with-safety.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
--- a/distributions/meta-reference-gpu/run.yaml
+++ b/distributions/meta-reference-gpu/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-gpu/run.yaml
--- a/distributions/meta-reference-quantized-gpu/build.yaml
+++ b/distributions/meta-reference-quantized-gpu/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml
--- a/distributions/meta-reference-quantized-gpu/compose.yaml
+++ b/distributions/meta-reference-quantized-gpu/compose.yaml
@ -1,35 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-meta-reference-quantized-gpu
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: 1
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
-            capabilities: [gpu]
-    runtime: nvidia
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/meta-reference-quantized-gpu/run.yaml
+++ b/distributions/meta-reference-quantized-gpu/run.yaml
@ -1,58 +0,0 @@
-version: '2'
-image_name: local
-container_image: null
-conda_env: local
-apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
-providers:
-  inference:
-  - provider_id: meta0
-    provider_type: inline::meta-reference-quantized
-    config:
-      model: Llama3.2-3B-Instruct:int4-qlora-eo8
-      quantization:
-        type: int4
-      torch_seed: null
-      max_seq_len: 2048
-      max_batch_size: 1
-  - provider_id: meta1
-    provider_type: inline::meta-reference-quantized
-    config:
-      # not a quantized model !
-      model: Llama-Guard-3-1B
-      quantization: null
-      torch_seed: null
-      max_seq_len: 2048
-      max_batch_size: 1
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
--- a/distributions/ollama/build.yaml
+++ b/distributions/ollama/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/ollama/build.yaml
--- a/distributions/ollama/compose.yaml
+++ b/distributions/ollama/compose.yaml
@ -1,71 +0,0 @@
-services:
-  ollama:
-    image: ollama/ollama:latest
-    network_mode: ${NETWORK_MODE:-bridge}
-    volumes:
-      - ~/.ollama:/root/.ollama
-    ports:
-      - "11434:11434"
-    environment:
-      OLLAMA_DEBUG: 1
-    command: []
-    deploy:
-      resources:
-        limits:
-          memory: 8G    # Set maximum memory
-        reservations:
-          memory: 8G    # Set minimum memory reservation
-    # healthcheck:
-    #   # ugh, no CURL in ollama image
-    #   test: ["CMD", "curl", "-f", "http://ollama:11434"]
-    #   interval: 10s
-    #   timeout: 5s
-    #   retries: 5
-
-  ollama-init:
-    image: ollama/ollama:latest
-    depends_on:
-      - ollama
-        # condition: service_healthy
-    network_mode: ${NETWORK_MODE:-bridge}
-    environment:
-      - OLLAMA_HOST=ollama
-      - INFERENCE_MODEL=${INFERENCE_MODEL}
-      - SAFETY_MODEL=${SAFETY_MODEL:-}
-    volumes:
-      - ~/.ollama:/root/.ollama
-      - ./pull-models.sh:/pull-models.sh
-    entrypoint: ["/pull-models.sh"]
-
-  llamastack:
-    depends_on:
-      ollama:
-        condition: service_started
-      ollama-init:
-        condition: service_started
-    image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
-    network_mode: ${NETWORK_MODE:-bridge}
-    volumes:
-      - ~/.llama:/root/.llama
-      # Link to ollama run.yaml file
-      - ~/local/llama-stack/:/app/llama-stack-source
-      - ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
-    ports:
-      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
-    environment:
-      - INFERENCE_MODEL=${INFERENCE_MODEL}
-      - SAFETY_MODEL=${SAFETY_MODEL:-}
-      - OLLAMA_URL=http://ollama:11434
-    entrypoint: >
-        python -m llama_stack.distribution.server.server /root/my-run.yaml \
-        --port ${LLAMA_STACK_PORT:-8321}
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 10s
-        max_attempts: 3
-        window: 60s
-volumes:
-  ollama:
-  ollama-init:
-  llamastack:
--- a/distributions/ollama/pull-models.sh
+++ b/distributions/ollama/pull-models.sh
@ -1,18 +0,0 @@
-#!/bin/sh
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
-for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
-  echo "Preloading $model..."
-  if ! ollama run "$model"; then
-    echo "Failed to pull and run $model"
-    exit 1
-  fi
-done
-
-echo "All models pulled successfully"
--- a/distributions/ollama/run-with-safety.yaml
+++ b/distributions/ollama/run-with-safety.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/ollama/run-with-safety.yaml
--- a/distributions/ollama/run.yaml
+++ b/distributions/ollama/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/ollama/run.yaml
--- a/distributions/ramalama/faiss_store.db
+++ b/distributions/ramalama/faiss_store.db
--- a/distributions/remote-nvidia/build.yaml
+++ b/distributions/remote-nvidia/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/nvidia/build.yaml
--- a/distributions/remote-nvidia/compose.yaml
+++ b/distributions/remote-nvidia/compose.yaml
@ -1,19 +0,0 @@
-services:
-  llamastack:
-    image: distribution-nvidia:dev
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-nvidia.yaml
-    ports:
-      - "8321:8321"
-    environment:
-      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
-      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/remote-nvidia/run.yaml
+++ b/distributions/remote-nvidia/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/nvidia/run.yaml
--- a/distributions/remote-vllm/build.yaml
+++ b/distributions/remote-vllm/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/remote-vllm/build.yaml
--- a/distributions/remote-vllm/compose.yaml
+++ b/distributions/remote-vllm/compose.yaml
@ -1,99 +0,0 @@
-services:
-  vllm-inference:
-    image: vllm/vllm-openai:latest
-    volumes:
-      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-       - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
-      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
-    command: >
-      --gpu-memory-utilization 0.75
-      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      --enforce-eager
-      --max-model-len 8192
-      --max-num-seqs 16
-      --port ${VLLM_INFERENCE_PORT:-5100}
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
-      interval: 30s
-      timeout: 10s
-      retries: 5
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            capabilities: [gpu]
-    runtime: nvidia
-
-  # A little trick:
-  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
-  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
-  vllm-${VLLM_SAFETY_MODEL:+safety}:
-    image: vllm/vllm-openai:latest
-    volumes:
-      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
-      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
-    command: >
-      --gpu-memory-utilization 0.75
-      --model ${VLLM_SAFETY_MODEL}
-      --enforce-eager
-      --max-model-len 8192
-      --max-num-seqs 16
-      --port ${VLLM_SAFETY_PORT:-5101}
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
-      interval: 30s
-      timeout: 10s
-      retries: 5
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            capabilities: [gpu]
-    runtime: nvidia
-  llamastack:
-    depends_on:
-      - vllm-inference:
-          condition: service_healthy
-      - vllm-${VLLM_SAFETY_MODEL:+safety}:
-          condition: service_healthy
-    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
-    network_mode: ${NETWORK_MODE:-bridged}
-    environment:
-      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
-      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
-      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      - MAX_TOKENS=${MAX_TOKENS:-4096}
-      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
-      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
-    ports:
-      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
-    # Hack: wait for vLLM server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 8321"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
-volumes:
-  vllm-inference:
-  vllm-safety:
-  llamastack:
--- a/distributions/remote-vllm/run-with-safety.yaml
+++ b/distributions/remote-vllm/run-with-safety.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/remote-vllm/run-with-safety.yaml
--- a/distributions/remote-vllm/run.yaml
+++ b/distributions/remote-vllm/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/remote-vllm/run.yaml
--- a/distributions/runpod/build.yaml
+++ b/distributions/runpod/build.yaml
@ -1,9 +0,0 @@
-name: runpod
-distribution_spec:
-  description: Use Runpod for running LLM inference
-  providers:
-    inference: remote::runpod
-    memory: meta-reference
-    safety: meta-reference
-    agents: meta-reference
-    telemetry: meta-reference
--- a/distributions/sambanova/build.yaml
+++ b/distributions/sambanova/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/sambanova/build.yaml
--- a/distributions/sambanova/compose.yaml
+++ b/distributions/sambanova/compose.yaml
@ -1,16 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-sambanova
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-sambanova.yaml
-    ports:
-      - "5000:5000"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-sambanova.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/sambanova/run.yaml
+++ b/distributions/sambanova/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/sambanova/run.yaml
--- a/distributions/tgi/build.yaml
+++ b/distributions/tgi/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/tgi/build.yaml
--- a/distributions/tgi/compose.yaml
+++ b/distributions/tgi/compose.yaml
@ -1,103 +0,0 @@
-services:
-  tgi-inference:
-    image: ghcr.io/huggingface/text-generation-inference:latest
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-       - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
-      - HF_TOKEN=$HF_TOKEN
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
-    command: >
-      --dtype bfloat16
-      --usage-stats off
-      --sharded false
-      --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      --port ${TGI_INFERENCE_PORT:-8080}
-      --cuda-memory-fraction 0.75
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
-      interval: 5s
-      timeout: 5s
-      retries: 30
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            capabilities: [gpu]
-    runtime: nvidia
-
-  tgi-${TGI_SAFETY_MODEL:+safety}:
-    image: ghcr.io/huggingface/text-generation-inference:latest
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-       - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
-      - HF_TOKEN=$HF_TOKEN
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
-    command: >
-      --dtype bfloat16
-      --usage-stats off
-      --sharded false
-      --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
-      --port ${TGI_SAFETY_PORT:-8081}
-      --cuda-memory-fraction 0.75
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
-      interval: 5s
-      timeout: 5s
-      retries: 30
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            capabilities: [gpu]
-    runtime: nvidia
-
-  llamastack:
-    depends_on:
-      tgi-inference:
-        condition: service_healthy
-      tgi-${TGI_SAFETY_MODEL:+safety}:
-        condition: service_healthy
-    image: llamastack/distribution-tgi:test-0.0.52rc3
-    network_mode: ${NETWORK_MODE:-bridged}
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
-    ports:
-      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
-    # Hack: wait for TGI server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    restart_policy:
-      condition: on-failure
-      delay: 3s
-      max_attempts: 5
-      window: 60s
-    environment:
-      - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
-      - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
-      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
-
-volumes:
-  tgi-inference:
-  tgi-safety:
-  llamastack:
--- a/distributions/tgi/run-with-safety.yaml
+++ b/distributions/tgi/run-with-safety.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/tgi/run-with-safety.yaml
--- a/distributions/tgi/run.yaml
+++ b/distributions/tgi/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/tgi/run.yaml
--- a/distributions/together/build.yaml
+++ b/distributions/together/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/together/build.yaml
--- a/distributions/together/compose.yaml
+++ b/distributions/together/compose.yaml
@ -1,14 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-together
-    ports:
-      - "8321:8321"
-    environment:
-      - TOGETHER_API_KEY=${TOGETHER_API_KEY}
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template together"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/together/run.yaml
+++ b/distributions/together/run.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/together/run.yaml
--- a/distributions/vllm-gpu/build.yaml
+++ b/distributions/vllm-gpu/build.yaml
@ -1 +0,0 @@
-../../llama_stack/templates/inline-vllm/build.yaml
--- a/distributions/vllm-gpu/compose.yaml
+++ b/distributions/vllm-gpu/compose.yaml
@ -1,35 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-inline-vllm
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-          - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: 1
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
-            capabilities: [gpu]
-    runtime: nvidia
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
--- a/distributions/vllm-gpu/run.yaml
+++ b/distributions/vllm-gpu/run.yaml
@ -1,66 +0,0 @@
-version: '2'
-image_name: local
-container_image: null
-conda_env: local
-apis:
- shields
- agents
- models
- memory
- memory_banks
- inference
- safety
-providers:
-  inference:
-  - provider_id: vllm-inference
-    provider_type: inline::vllm
-    config:
-      model: Llama3.2-3B-Instruct
-      tensor_parallel_size: 1
-      gpu_memory_utilization: 0.4
-      enforce_eager: true
-      max_tokens: 4096
-  - provider_id: vllm-inference-safety
-    provider_type: inline::vllm
-    config:
-      model: Llama-Guard-3-1B
-      tensor_parallel_size: 1
-      gpu_memory_utilization: 0.2
-      enforce_eager: true
-      max_tokens: 4096
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  # Uncomment to use prompt guard
-  # - provider_id: meta1
-  #   provider_type: inline::prompt-guard
-  #   config:
-  #     model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-  # Uncomment to use pgvector
-  # - provider_id: pgvector
-  #   provider_type: remote::pgvector
-  #   config:
-  #     host: 127.0.0.1
-  #     port: 5432
-  #     db: postgres
-  #     user: postgres
-  #     password: mysecretpassword
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/agents_store.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/bedrock/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/bedrock/run.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/cerebras/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/cerebras/run.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/fireworks/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/fireworks/run.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/meta-reference-gpu/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/ollama/build.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/ollama/run-with-safety.yaml`
				`@ -1 +0,0 @@`
				`../../llama_stack/templates/nvidia/build.yaml`