diff --git a/distributions/remote-vllm/compose.yaml b/distributions/remote-vllm/compose.yaml
index a83ed79fc..88d10f5b4 100644
--- a/distributions/remote-vllm/compose.yaml
+++ b/distributions/remote-vllm/compose.yaml
@@ -1,43 +1,83 @@
+# NOTES:
+#
+# This Docker Compose (and the associated run.yaml) assumes you will be
+# running in the default "bridged" network mode.
+#
+# If you need "host" network mode, please uncomment
+# - network_mode: "host"
+# and comment the lines with port mapping
+# - ports:
+#     - "5100:5100"
+#
+# Similarly change "host.docker.internal" to "localhost" in the run.yaml file
+#
 services:
-  vllm:
+  vllm-0:
     image: vllm/vllm-openai:latest
-    network_mode: "host"
     volumes:
       - $HOME/.cache/huggingface:/root/.cache/huggingface
+    # network_mode: "host"
     ports:
-      - "8000:8000"
+      - "5100:5100"
     devices:
       - nvidia.com/gpu=all
     environment:
-      - CUDA_VISIBLE_DEVICES=0
-    command: []
+      - CUDA_VISIBLE_DEVICES=4
+      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
+    command: >
+      --gpu-memory-utilization 0.75
+      --model meta-llama/Llama-3.1-8B-Instruct
+      --enforce-eager
+      --max-model-len 8192
+      --max-num-seqs 16
+      --port 5100
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            capabilities: [gpu]
+    runtime: nvidia
+  vllm-1:
+    image: vllm/vllm-openai:latest
+    volumes:
+      - $HOME/.cache/huggingface:/root/.cache/huggingface
+    # network_mode: "host"
+    ports:
+      - "5101:5101"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=5
+      - HUGGING_FACE_HUB_TOKEN=$HF_TOKEN
+    command: >
+      --gpu-memory-utilization 0.75
+      --model meta-llama/Llama-Guard-3-1B
+      --enforce-eager
+      --max-model-len 8192
+      --max-num-seqs 16
+      --port 5101
     deploy:
       resources:
         reservations:
           devices:
           - driver: nvidia
-            # that's the closest analogue to --gpus; provide
-            # an integer amount of devices or 'all'
-            count: 1
-            # Devices are reserved using a list of capabilities, making
-            # capabilities the only required field. A device MUST
-            # satisfy all the requested capabilities for a successful
-            # reservation.
             capabilities: [gpu]
     runtime: nvidia
   llamastack:
     depends_on:
-    - vllm
-    image: llamastack/distribution-remote-vllm
-    network_mode: "host"
+    - vllm-0
+    - vllm-1
+    # image: llamastack/distribution-remote-vllm
+    image: localhost/distribution-remote-vllm:test-0.0.52rc3
     volumes:
       - ~/.llama:/root/.llama
-      # Link to ollama run.yaml file
-      - ./run.yaml:/root/llamastack-run-remote-vllm.yaml
+      - ~/local/llama-stack/distributions/remote-vllm/run.yaml:/root/llamastack-run-remote-vllm.yaml
+    # network_mode: "host"
     ports:
-      - "5000:5000"
-    # Hack: wait for vllm server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml"
+      - "5001:5001"
+    # Hack: wait for vLLM server to start before starting docker
+    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml --port 5001"
     deploy:
       restart_policy:
         condition: on-failure
@@ -45,4 +85,6 @@ services:
         max_attempts: 5
         window: 60s
 volumes:
-  vllm:
+  vllm-0:
+  vllm-1:
+  llamastack:
diff --git a/distributions/remote-vllm/run.yaml b/distributions/remote-vllm/run.yaml
index 4c0a25f56..af02b1ba5 100644
--- a/distributions/remote-vllm/run.yaml
+++ b/distributions/remote-vllm/run.yaml
@@ -1,35 +1,47 @@
 version: '2'
-built_at: '2024-10-08T17:40:45.325529'
-image_name: local
-docker_image: null
-conda_env: local
+built_at: '2024-11-11T20:09:45.988375'
+image_name: remote-vllm
+docker_image: remote-vllm
+conda_env: null
 apis:
-- shields
-- agents
-- models
-- memory
-- memory_banks
 - inference
+- memory
 - safety
+- agents
+- telemetry
 providers:
   inference:
-  - provider_id: vllm0
+  # serves main inference model
+  - provider_id: vllm-0
     provider_type: remote::vllm
     config:
-      url: http://127.0.0.1:8000
+      # NOTE: replace with "localhost" if you are running in "host" network mode
+      url: http://host.docker.internal:5100/v1
+      max_tokens: 4096
+      api_token: fake
+  # serves safety llama_guard model
+  - provider_id: vllm-1
+    provider_type: remote::vllm
+    config:
+      # NOTE: replace with "localhost" if you are running in "host" network mode
+      url: http://host.docker.internal:5101/v1
+      max_tokens: 4096
+      api_token: fake
+  memory:
+  - provider_id: faiss-0
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        namespace: null
+        type: sqlite
+        db_path: /home/ashwin/.llama/distributions/remote-vllm/faiss_store.db
   safety:
-  - provider_id: meta0
+  - provider_id: llama-guard
     provider_type: inline::llama-guard
-    config:
-      model: Llama-Guard-3-1B
-      excluded_categories: []
-  - provider_id: meta1
-    provider_type: inline::prompt-guard
-    config:
-      model: Prompt-Guard-86M
+    config: {}
   memory:
   - provider_id: meta0
-    provider_type: inline::meta-reference
+    provider_type: inline::faiss
     config: {}
   agents:
   - provider_id: meta0
@@ -38,8 +50,19 @@ providers:
     persistence_store:
       namespace: null
      type: sqlite
-      db_path: ~/.llama/runtime/kvstore.db
+      db_path: /home/ashwin/.llama/distributions/remote-vllm/agents_store.db
   telemetry:
   - provider_id: meta0
     provider_type: inline::meta-reference
     config: {}
+metadata_store:
+  namespace: null
+  type: sqlite
+  db_path: /home/ashwin/.llama/distributions/remote-vllm/registry.db
+models:
+  - model_id: Llama3.1-8B-Instruct
+    provider_id: vllm-0
+  - model_id: Llama-Guard-3-1B
+    provider_id: vllm-1
+shields:
+  - shield_id: Llama-Guard-3-1B
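
For reference, a minimal way to bring this stack up and sanity-check it in the default bridged network mode. This is a sketch and not part of the diff; it assumes compose.yaml lives at the path shown above, that HF_TOKEN is exported in your shell so Compose can forward it to the vLLM containers, and that the token value below is a placeholder.

    # Placeholder token; substitute your own HuggingFace access token.
    export HF_TOKEN=hf_xxxxxxxxxxxxxxxx

    # Start both vLLM containers and the Llama Stack server.
    docker compose -f distributions/remote-vllm/compose.yaml up -d

    # Each vLLM container exposes an OpenAI-compatible API on its mapped port;
    # listing models is a quick readiness check for the inference (5100) and
    # safety (5101) servers.
    curl http://localhost:5100/v1/models
    curl http://localhost:5101/v1/models

    # The Llama Stack server itself listens on port 5001 once the 60-second
    # startup delay in its entrypoint has elapsed.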