diff --git a/distributions/meta-reference-quantized-gpu/compose.yaml b/distributions/meta-reference-quantized-gpu/compose.yaml
new file mode 100644
index 000000000..f9fe9f45d
--- /dev/null
+++ b/distributions/meta-reference-quantized-gpu/compose.yaml
@@ -0,0 +1,34 @@
+services:
+  llamastack:
+    image: llamastack/distribution-meta-reference-quantized-gpu
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/my-run.yaml
+    ports:
+      - "5000:5000"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=0
+    command: []
+    runtime: nvidia
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              # this is the closest analogue to --gpus; provide
+              # an integer number of devices or 'all'
+              count: 1
+              # Devices are reserved using a list of capabilities, making
+              # capabilities the only required field. A device MUST
+              # satisfy all the requested capabilities for a successful
+              # reservation.
+              capabilities: [gpu]
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
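
For reference, bringing this distribution up from the new compose file is a standard Docker Compose invocation. The sketch below assumes Docker Compose v2 and the NVIDIA Container Toolkit are installed on the host, and that a run.yaml for the meta-reference-quantized-gpu distribution sits next to compose.yaml (the service bind-mounts it into the container as /root/my-run.yaml):

    cd distributions/meta-reference-quantized-gpu
    docker compose up

The entrypoint then launches the Llama Stack server against the mounted config, and the restart policy retries the container on failure (up to 5 attempts, 3s apart).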