diff --git a/distributions/bedrock/build.yaml b/distributions/bedrock/build.yaml
deleted file mode 120000
index 72402ef8d..000000000
--- a/distributions/bedrock/build.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/bedrock/build.yaml
\ No newline at end of file
diff --git a/distributions/bedrock/compose.yaml b/distributions/bedrock/compose.yaml
deleted file mode 100644
index 055b92c67..000000000
--- a/distributions/bedrock/compose.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-services:
-  llamastack:
-    image: distribution-bedrock
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-bedrock.yaml
-    ports:
-      - "8321:8321"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-bedrock.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
diff --git a/distributions/bedrock/run.yaml b/distributions/bedrock/run.yaml
deleted file mode 120000
index f38abfc4e..000000000
--- a/distributions/bedrock/run.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/bedrock/run.yaml
\ No newline at end of file
diff --git a/distributions/cerebras/build.yaml b/distributions/cerebras/build.yaml
deleted file mode 120000
index bccbbcf60..000000000
--- a/distributions/cerebras/build.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/cerebras/build.yaml
\ No newline at end of file
diff --git a/distributions/cerebras/compose.yaml b/distributions/cerebras/compose.yaml
deleted file mode 100644
index 8dc09a865..000000000
--- a/distributions/cerebras/compose.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-cerebras
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-cerebras.yaml
-    ports:
-      - "8321:8321"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-cerebras.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
diff --git a/distributions/cerebras/run.yaml b/distributions/cerebras/run.yaml
deleted file mode 120000
index 9f9d20b4b..000000000
--- a/distributions/cerebras/run.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/cerebras/run.yaml
\ No newline at end of file
diff --git a/distributions/dell-tgi/compose.yaml b/distributions/dell-tgi/compose.yaml
deleted file mode 100644
index d26636cbd..000000000
--- a/distributions/dell-tgi/compose.yaml
+++ /dev/null
@@ -1,50 +0,0 @@
-services:
-  text-generation-inference:
-    image: registry.dell.huggingface.co/enterprise-dell-inference-meta-llama-meta-llama-3.1-8b-instruct
-    network_mode: "host"
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    ports:
-      - "5009:5009"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0,1,2,3,4
-      - NUM_SHARD=4
-      - MAX_BATCH_PREFILL_TOKENS=32768
-      - MAX_INPUT_TOKENS=8000
-      - MAX_TOTAL_TOKENS=8192
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              # that's the closest analogue to --gpus; provide
-              # an integer amount of devices or 'all'
-              count: all
-              # Devices are reserved using a list of capabilities, making
-              # capabilities the only required field. A device MUST
-              # satisfy all the requested capabilities for a successful
-              # reservation.
-              capabilities: [gpu]
-    runtime: nvidia
-  llamastack:
-    depends_on:
-      text-generation-inference:
-        condition: service_healthy
-    image: llamastack/distribution-tgi
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      # Link to TGI run.yaml file
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    # Hack: wait for TGI server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    restart_policy:
-      condition: on-failure
-      delay: 3s
-      max_attempts: 5
-      window: 60s
diff --git a/distributions/dell-tgi/run.yaml b/distributions/dell-tgi/run.yaml
deleted file mode 100644
index cd6ddcfdf..000000000
--- a/distributions/dell-tgi/run.yaml
+++ /dev/null
@@ -1,44 +0,0 @@
-version: '2'
-image_name: local
-container_image: null
-conda_env: local
-apis:
-- shields
-- agents
-- models
-- memory
-- memory_banks
-- inference
-- safety
-providers:
-  inference:
-  - provider_id: tgi0
-    provider_type: remote::tgi
-    config:
-      url: http://127.0.0.1:80
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::faiss
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
diff --git a/distributions/fireworks/build.yaml b/distributions/fireworks/build.yaml
deleted file mode 120000
index 32a5bd869..000000000
--- a/distributions/fireworks/build.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/fireworks/build.yaml
\ No newline at end of file
diff --git a/distributions/fireworks/compose.yaml b/distributions/fireworks/compose.yaml
deleted file mode 100644
index 84b8491e4..000000000
--- a/distributions/fireworks/compose.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-fireworks
-    ports:
-      - "8321:8321"
-    environment:
-      - FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template fireworks"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
diff --git a/distributions/fireworks/run.yaml b/distributions/fireworks/run.yaml
deleted file mode 120000
index 532e0e2a8..000000000
--- a/distributions/fireworks/run.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/fireworks/run.yaml
\ No newline at end of file
diff --git a/distributions/meta-reference-gpu/build.yaml b/distributions/meta-reference-gpu/build.yaml
deleted file mode 120000
index 4418195eb..000000000
--- a/distributions/meta-reference-gpu/build.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-gpu/build.yaml
\ No newline at end of file
diff --git a/distributions/meta-reference-gpu/compose.yaml b/distributions/meta-reference-gpu/compose.yaml
deleted file mode 100644
index d977e92ea..000000000
--- a/distributions/meta-reference-gpu/compose.yaml
+++ /dev/null
@@ -1,34 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-meta-reference-gpu
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              # that's the closest analogue to --gpus; provide
-              # an integer amount of devices or 'all'
-              count: 1
-              # Devices are reserved using a list of capabilities, making
-              # capabilities the only required field. A device MUST
-              # satisfy all the requested capabilities for a successful
-              # reservation.
-              capabilities: [gpu]
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
-    runtime: nvidia
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
diff --git a/distributions/meta-reference-gpu/run-with-safety.yaml b/distributions/meta-reference-gpu/run-with-safety.yaml
deleted file mode 120000
index 4c5483425..000000000
--- a/distributions/meta-reference-gpu/run-with-safety.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-gpu/run-with-safety.yaml
\ No newline at end of file
diff --git a/distributions/meta-reference-gpu/run.yaml b/distributions/meta-reference-gpu/run.yaml
deleted file mode 120000
index d680186ab..000000000
--- a/distributions/meta-reference-gpu/run.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-gpu/run.yaml
\ No newline at end of file
diff --git a/distributions/meta-reference-quantized-gpu/build.yaml b/distributions/meta-reference-quantized-gpu/build.yaml
deleted file mode 120000
index f3dbe996f..000000000
--- a/distributions/meta-reference-quantized-gpu/build.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/meta-reference-quantized-gpu/build.yaml
\ No newline at end of file
diff --git a/distributions/meta-reference-quantized-gpu/compose.yaml b/distributions/meta-reference-quantized-gpu/compose.yaml
deleted file mode 100644
index 98e943dce..000000000
--- a/distributions/meta-reference-quantized-gpu/compose.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-meta-reference-quantized-gpu
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              # that's the closest analogue to --gpus; provide
-              # an integer amount of devices or 'all'
-              count: 1
-              # Devices are reserved using a list of capabilities, making
-              # capabilities the only required field. A device MUST
-              # satisfy all the requested capabilities for a successful
-              # reservation.
-              capabilities: [gpu]
-    runtime: nvidia
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
diff --git a/distributions/meta-reference-quantized-gpu/run.yaml b/distributions/meta-reference-quantized-gpu/run.yaml
deleted file mode 100644
index eb631adaa..000000000
--- a/distributions/meta-reference-quantized-gpu/run.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-version: '2'
-image_name: local
-container_image: null
-conda_env: local
-apis:
-- shields
-- agents
-- models
-- memory
-- memory_banks
-- inference
-- safety
-providers:
-  inference:
-  - provider_id: meta0
-    provider_type: inline::meta-reference-quantized
-    config:
-      model: Llama3.2-3B-Instruct:int4-qlora-eo8
-      quantization:
-        type: int4
-      torch_seed: null
-      max_seq_len: 2048
-      max_batch_size: 1
-  - provider_id: meta1
-    provider_type: inline::meta-reference-quantized
-    config:
-      # not a quantized model !
-      model: Llama-Guard-3-1B
-      quantization: null
-      torch_seed: null
-      max_seq_len: 2048
-      max_batch_size: 1
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
diff --git a/distributions/ollama/build.yaml b/distributions/ollama/build.yaml
deleted file mode 120000
index 8772548e0..000000000
--- a/distributions/ollama/build.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/ollama/build.yaml
\ No newline at end of file
diff --git a/distributions/ollama/compose.yaml b/distributions/ollama/compose.yaml
deleted file mode 100644
index 06e6c1359..000000000
--- a/distributions/ollama/compose.yaml
+++ /dev/null
@@ -1,71 +0,0 @@
-services:
-  ollama:
-    image: ollama/ollama:latest
-    network_mode: ${NETWORK_MODE:-bridge}
-    volumes:
-      - ~/.ollama:/root/.ollama
-    ports:
-      - "11434:11434"
-    environment:
-      OLLAMA_DEBUG: 1
-    command: []
-    deploy:
-      resources:
-        limits:
-          memory: 8G # Set maximum memory
-        reservations:
-          memory: 8G # Set minimum memory reservation
-    # healthcheck:
-    #   # ugh, no CURL in ollama image
-    #   test: ["CMD", "curl", "-f", "http://ollama:11434"]
-    #   interval: 10s
-    #   timeout: 5s
-    #   retries: 5
-
-  ollama-init:
-    image: ollama/ollama:latest
-    depends_on:
-      - ollama
-      # condition: service_healthy
-    network_mode: ${NETWORK_MODE:-bridge}
-    environment:
-      - OLLAMA_HOST=ollama
-      - INFERENCE_MODEL=${INFERENCE_MODEL}
-      - SAFETY_MODEL=${SAFETY_MODEL:-}
-    volumes:
-      - ~/.ollama:/root/.ollama
-      - ./pull-models.sh:/pull-models.sh
-    entrypoint: ["/pull-models.sh"]
-
-  llamastack:
-    depends_on:
-      ollama:
-        condition: service_started
-      ollama-init:
-        condition: service_started
-    image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama}
-    network_mode: ${NETWORK_MODE:-bridge}
-    volumes:
-      - ~/.llama:/root/.llama
-      # Link to ollama run.yaml file
-      - ~/local/llama-stack/:/app/llama-stack-source
-      - ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
-    ports:
-      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
-    environment:
-      - INFERENCE_MODEL=${INFERENCE_MODEL}
-      - SAFETY_MODEL=${SAFETY_MODEL:-}
-      - OLLAMA_URL=http://ollama:11434
-    entrypoint: >
-      python -m llama_stack.distribution.server.server /root/my-run.yaml \
-      --port ${LLAMA_STACK_PORT:-8321}
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 10s
-        max_attempts: 3
-        window: 60s
-volumes:
-  ollama:
-  ollama-init:
-  llamastack:
diff --git a/distributions/ollama/pull-models.sh b/distributions/ollama/pull-models.sh
deleted file mode 100755
index fb5bf8a4a..000000000
--- a/distributions/ollama/pull-models.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/sh
-
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..."
-for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do
-  echo "Preloading $model..."
-  if ! ollama run "$model"; then
-    echo "Failed to pull and run $model"
-    exit 1
-  fi
-done
-
-echo "All models pulled successfully"
diff --git a/distributions/ollama/run-with-safety.yaml b/distributions/ollama/run-with-safety.yaml
deleted file mode 120000
index 5695b49e7..000000000
--- a/distributions/ollama/run-with-safety.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/ollama/run-with-safety.yaml
\ No newline at end of file
diff --git a/distributions/ollama/run.yaml b/distributions/ollama/run.yaml
deleted file mode 120000
index b008b1bf4..000000000
--- a/distributions/ollama/run.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/ollama/run.yaml
\ No newline at end of file
diff --git a/distributions/ramalama/faiss_store.db b/distributions/ramalama/faiss_store.db
deleted file mode 100644
index 573e60e90..000000000
Binary files a/distributions/ramalama/faiss_store.db and /dev/null differ
diff --git a/distributions/remote-nvidia/build.yaml b/distributions/remote-nvidia/build.yaml
deleted file mode 120000
index 8903d2e57..000000000
--- a/distributions/remote-nvidia/build.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/nvidia/build.yaml
\ No newline at end of file
diff --git a/distributions/remote-nvidia/compose.yaml b/distributions/remote-nvidia/compose.yaml
deleted file mode 100644
index ab8b4ce25..000000000
--- a/distributions/remote-nvidia/compose.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-services:
-  llamastack:
-    image: distribution-nvidia:dev
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-nvidia.yaml
-    ports:
-      - "8321:8321"
-    environment:
-      - INFERENCE_MODEL=${INFERENCE_MODEL:-Llama3.1-8B-Instruct}
-      - NVIDIA_API_KEY=${NVIDIA_API_KEY:-}
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml-config /root/llamastack-run-nvidia.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
diff --git a/distributions/remote-nvidia/run.yaml b/distributions/remote-nvidia/run.yaml
deleted file mode 120000
index 85da3e26b..000000000
--- a/distributions/remote-nvidia/run.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/nvidia/run.yaml
\ No newline at end of file
diff --git a/distributions/remote-vllm/build.yaml b/distributions/remote-vllm/build.yaml
deleted file mode 120000
index 52e5d0f2d..000000000
--- a/distributions/remote-vllm/build.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/remote-vllm/build.yaml
\ No newline at end of file
diff --git a/distributions/remote-vllm/compose.yaml b/distributions/remote-vllm/compose.yaml
deleted file mode 100644
index 8b6e11b3a..000000000
--- a/distributions/remote-vllm/compose.yaml
+++ /dev/null
@@ -1,99 +0,0 @@
-services:
-  vllm-inference:
-    image: vllm/vllm-openai:latest
-    volumes:
-      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-      - "${VLLM_INFERENCE_PORT:-5100}:${VLLM_INFERENCE_PORT:-5100}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${VLLM_INFERENCE_GPU:-0}
-      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
-    command: >
-      --gpu-memory-utilization 0.75
-      --model ${VLLM_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      --enforce-eager
-      --max-model-len 8192
-      --max-num-seqs 16
-      --port ${VLLM_INFERENCE_PORT:-5100}
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_INFERENCE_PORT:-5100}/v1/health"]
-      interval: 30s
-      timeout: 10s
-      retries: 5
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              capabilities: [gpu]
-    runtime: nvidia
-
-  # A little trick:
-  # if VLLM_SAFETY_MODEL is set, we will create a service for the safety model
-  # otherwise, the entry will end in a hyphen which gets ignored by docker compose
-  vllm-${VLLM_SAFETY_MODEL:+safety}:
-    image: vllm/vllm-openai:latest
-    volumes:
-      - $HOME/.cache/huggingface:/root/.cache/huggingface
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-      - "${VLLM_SAFETY_PORT:-5101}:${VLLM_SAFETY_PORT:-5101}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${VLLM_SAFETY_GPU:-1}
-      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
-    command: >
-      --gpu-memory-utilization 0.75
-      --model ${VLLM_SAFETY_MODEL}
-      --enforce-eager
-      --max-model-len 8192
-      --max-num-seqs 16
-      --port ${VLLM_SAFETY_PORT:-5101}
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:${VLLM_SAFETY_PORT:-5101}/v1/health"]
-      interval: 30s
-      timeout: 10s
-      retries: 5
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              capabilities: [gpu]
-    runtime: nvidia
-  llamastack:
-    depends_on:
-      - vllm-inference:
-          condition: service_healthy
-      - vllm-${VLLM_SAFETY_MODEL:+safety}:
-          condition: service_healthy
-    image: llamastack/distribution-remote-vllm:test-0.0.52rc3
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run${VLLM_SAFETY_MODEL:+-with-safety}.yaml:/root/llamastack-run-remote-vllm.yaml
-    network_mode: ${NETWORK_MODE:-bridged}
-    environment:
-      - VLLM_URL=http://vllm-inference:${VLLM_INFERENCE_PORT:-5100}/v1
-      - VLLM_SAFETY_URL=http://vllm-safety:${VLLM_SAFETY_PORT:-5101}/v1
-      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      - MAX_TOKENS=${MAX_TOKENS:-4096}
-      - SQLITE_STORE_DIR=${SQLITE_STORE_DIR:-$HOME/.llama/distributions/remote-vllm}
-      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
-    ports:
-      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
-    # Hack: wait for vLLM server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 8321"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
-volumes:
-  vllm-inference:
-  vllm-safety:
-  llamastack:
diff --git a/distributions/remote-vllm/run-with-safety.yaml b/distributions/remote-vllm/run-with-safety.yaml
deleted file mode 120000
index b2c3c36da..000000000
--- a/distributions/remote-vllm/run-with-safety.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/remote-vllm/run-with-safety.yaml
\ No newline at end of file
diff --git a/distributions/remote-vllm/run.yaml b/distributions/remote-vllm/run.yaml
deleted file mode 120000
index ac70c0e6a..000000000
--- a/distributions/remote-vllm/run.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/remote-vllm/run.yaml
\ No newline at end of file
diff --git a/distributions/runpod/build.yaml b/distributions/runpod/build.yaml
deleted file mode 100644
index 9348573ef..000000000
--- a/distributions/runpod/build.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-name: runpod
-distribution_spec:
-  description: Use Runpod for running LLM inference
-  providers:
-    inference: remote::runpod
-    memory: meta-reference
-    safety: meta-reference
-    agents: meta-reference
-    telemetry: meta-reference
diff --git a/distributions/sambanova/build.yaml b/distributions/sambanova/build.yaml
deleted file mode 100644
index dbf013d2d..000000000
--- a/distributions/sambanova/build.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/sambanova/build.yaml
diff --git a/distributions/sambanova/compose.yaml b/distributions/sambanova/compose.yaml
deleted file mode 100644
index 58b9fb1ef..000000000
--- a/distributions/sambanova/compose.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-sambanova
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/llamastack-run-sambanova.yaml
-    ports:
-      - "5000:5000"
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-sambanova.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
diff --git a/distributions/sambanova/run.yaml b/distributions/sambanova/run.yaml
deleted file mode 100644
index 385282c67..000000000
--- a/distributions/sambanova/run.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/sambanova/run.yaml
diff --git a/distributions/tgi/build.yaml b/distributions/tgi/build.yaml
deleted file mode 120000
index 73e59ad84..000000000
--- a/distributions/tgi/build.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/tgi/build.yaml
\ No newline at end of file
diff --git a/distributions/tgi/compose.yaml b/distributions/tgi/compose.yaml
deleted file mode 100644
index d7b3bc77e..000000000
--- a/distributions/tgi/compose.yaml
+++ /dev/null
@@ -1,103 +0,0 @@
-services:
-  tgi-inference:
-    image: ghcr.io/huggingface/text-generation-inference:latest
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-      - "${TGI_INFERENCE_PORT:-8080}:${TGI_INFERENCE_PORT:-8080}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${TGI_INFERENCE_GPU:-0}
-      - HF_TOKEN=$HF_TOKEN
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
-    command: >
-      --dtype bfloat16
-      --usage-stats off
-      --sharded false
-      --model-id ${TGI_INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      --port ${TGI_INFERENCE_PORT:-8080}
-      --cuda-memory-fraction 0.75
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://tgi-inference:${TGI_INFERENCE_PORT:-8080}/health"]
-      interval: 5s
-      timeout: 5s
-      retries: 30
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              capabilities: [gpu]
-    runtime: nvidia
-
-  tgi-${TGI_SAFETY_MODEL:+safety}:
-    image: ghcr.io/huggingface/text-generation-inference:latest
-    volumes:
-      - $HOME/.cache/huggingface:/data
-    network_mode: ${NETWORK_MODE:-bridged}
-    ports:
-      - "${TGI_SAFETY_PORT:-8081}:${TGI_SAFETY_PORT:-8081}"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=${TGI_SAFETY_GPU:-1}
-      - HF_TOKEN=$HF_TOKEN
-      - HF_HOME=/data
-      - HF_DATASETS_CACHE=/data
-      - HF_MODULES_CACHE=/data
-      - HF_HUB_CACHE=/data
-    command: >
-      --dtype bfloat16
-      --usage-stats off
-      --sharded false
-      --model-id ${TGI_SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
-      --port ${TGI_SAFETY_PORT:-8081}
-      --cuda-memory-fraction 0.75
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://tgi-safety:${TGI_SAFETY_PORT:-8081}/health"]
-      interval: 5s
-      timeout: 5s
-      retries: 30
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              capabilities: [gpu]
-    runtime: nvidia
-
-  llamastack:
-    depends_on:
-      tgi-inference:
-        condition: service_healthy
-      tgi-${TGI_SAFETY_MODEL:+safety}:
-        condition: service_healthy
-    image: llamastack/distribution-tgi:test-0.0.52rc3
-    network_mode: ${NETWORK_MODE:-bridged}
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run${TGI_SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml
-    ports:
-      - "${LLAMA_STACK_PORT:-8321}:${LLAMA_STACK_PORT:-8321}"
-    # Hack: wait for TGI server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    restart_policy:
-      condition: on-failure
-      delay: 3s
-      max_attempts: 5
-      window: 60s
-    environment:
-      - TGI_URL=http://tgi-inference:${TGI_INFERENCE_PORT:-8080}
-      - SAFETY_TGI_URL=http://tgi-safety:${TGI_SAFETY_PORT:-8081}
-      - INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
-      - SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
-
-volumes:
-  tgi-inference:
-  tgi-safety:
-  llamastack:
diff --git a/distributions/tgi/run-with-safety.yaml b/distributions/tgi/run-with-safety.yaml
deleted file mode 120000
index 62d26708e..000000000
--- a/distributions/tgi/run-with-safety.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/tgi/run-with-safety.yaml
\ No newline at end of file
diff --git a/distributions/tgi/run.yaml b/distributions/tgi/run.yaml
deleted file mode 120000
index f3cc3a502..000000000
--- a/distributions/tgi/run.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/tgi/run.yaml
\ No newline at end of file
diff --git a/distributions/together/build.yaml b/distributions/together/build.yaml
deleted file mode 120000
index 3877a9c96..000000000
--- a/distributions/together/build.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/together/build.yaml
\ No newline at end of file
diff --git a/distributions/together/compose.yaml b/distributions/together/compose.yaml
deleted file mode 100644
index f66ee69f9..000000000
--- a/distributions/together/compose.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-together
-    ports:
-      - "8321:8321"
-    environment:
-      - TOGETHER_API_KEY=${TOGETHER_API_KEY}
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --template together"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
diff --git a/distributions/together/run.yaml b/distributions/together/run.yaml
deleted file mode 120000
index 102d9866e..000000000
--- a/distributions/together/run.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/together/run.yaml
\ No newline at end of file
diff --git a/distributions/vllm-gpu/build.yaml b/distributions/vllm-gpu/build.yaml
deleted file mode 120000
index a95d34c1f..000000000
--- a/distributions/vllm-gpu/build.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../llama_stack/templates/inline-vllm/build.yaml
\ No newline at end of file
diff --git a/distributions/vllm-gpu/compose.yaml b/distributions/vllm-gpu/compose.yaml
deleted file mode 100644
index 98267cdc3..000000000
--- a/distributions/vllm-gpu/compose.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-services:
-  llamastack:
-    image: llamastack/distribution-inline-vllm
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "8321:8321"
-    devices:
-      - nvidia.com/gpu=all
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
-    command: []
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              # that's the closest analogue to --gpus; provide
-              # an integer amount of devices or 'all'
-              count: 1
-              # Devices are reserved using a list of capabilities, making
-              # capabilities the only required field. A device MUST
-              # satisfy all the requested capabilities for a successful
-              # reservation.
-              capabilities: [gpu]
-    runtime: nvidia
-    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
diff --git a/distributions/vllm-gpu/run.yaml b/distributions/vllm-gpu/run.yaml
deleted file mode 100644
index a75a4c451..000000000
--- a/distributions/vllm-gpu/run.yaml
+++ /dev/null
@@ -1,66 +0,0 @@
-version: '2'
-image_name: local
-container_image: null
-conda_env: local
-apis:
-- shields
-- agents
-- models
-- memory
-- memory_banks
-- inference
-- safety
-providers:
-  inference:
-  - provider_id: vllm-inference
-    provider_type: inline::vllm
-    config:
-      model: Llama3.2-3B-Instruct
-      tensor_parallel_size: 1
-      gpu_memory_utilization: 0.4
-      enforce_eager: true
-      max_tokens: 4096
-  - provider_id: vllm-inference-safety
-    provider_type: inline::vllm
-    config:
-      model: Llama-Guard-3-1B
-      tensor_parallel_size: 1
-      gpu_memory_utilization: 0.2
-      enforce_eager: true
-      max_tokens: 4096
-  safety:
-  - provider_id: meta0
-    provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  # Uncomment to use prompt guard
-  # - provider_id: meta1
-  #   provider_type: inline::prompt-guard
-  #   config:
-  #     model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}
-  # Uncomment to use pgvector
-  # - provider_id: pgvector
-  #   provider_type: remote::pgvector
-  #   config:
-  #     host: 127.0.0.1
-  #     port: 5432
-  #     db: postgres
-  #     user: postgres
-  #     password: mysecretpassword
-  agents:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/agents_store.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: inline::meta-reference
-    config: {}