From 38cdbdec5a8fa7eb6aa7e19e526a3d3c1608c3c5 Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe
Date: Fri, 8 Nov 2024 12:01:05 -0800
Subject: [PATCH] add inline-vllm details, fix things

---
 distributions/inline-vllm/build.yaml          |  2 +-
 distributions/inline-vllm/compose.yaml        | 35 ++++++++++
 distributions/inline-vllm/run.yaml            | 66 +++++++++++++++++++
 distributions/remote-vllm/build.yaml          |  2 +-
 .../remote-vllm/{gpu => }/compose.yaml        | 20 +++---
 distributions/remote-vllm/cpu/compose.yaml    | 30 ---------
 distributions/remote-vllm/gpu/run.yaml        | 46 -------------
 distributions/remote-vllm/{cpu => }/run.yaml  |  6 +-
 llama_stack/providers/registry/inference.py   |  2 +-
 llama_stack/templates/inline-vllm/build.yaml  | 13 ++++
 llama_stack/templates/remote-vllm/build.yaml  | 12 ++++
 llama_stack/templates/vllm/build.yaml         |  9 ---
 12 files changed, 142 insertions(+), 101 deletions(-)
 create mode 100644 distributions/inline-vllm/compose.yaml
 create mode 100644 distributions/inline-vllm/run.yaml
 rename distributions/remote-vllm/{gpu => }/compose.yaml (70%)
 delete mode 100644 distributions/remote-vllm/cpu/compose.yaml
 delete mode 100644 distributions/remote-vllm/gpu/run.yaml
 rename distributions/remote-vllm/{cpu => }/run.yaml (90%)
 create mode 100644 llama_stack/templates/inline-vllm/build.yaml
 create mode 100644 llama_stack/templates/remote-vllm/build.yaml
 delete mode 100644 llama_stack/templates/vllm/build.yaml

diff --git a/distributions/inline-vllm/build.yaml b/distributions/inline-vllm/build.yaml
index dfc9401b6..a95d34c1f 120000
--- a/distributions/inline-vllm/build.yaml
+++ b/distributions/inline-vllm/build.yaml
@@ -1 +1 @@
-../../llama_stack/templates/vllm/build.yaml
\ No newline at end of file
+../../llama_stack/templates/inline-vllm/build.yaml
\ No newline at end of file
diff --git a/distributions/inline-vllm/compose.yaml b/distributions/inline-vllm/compose.yaml
new file mode 100644
index 000000000..f8779c9ce
--- /dev/null
+++ b/distributions/inline-vllm/compose.yaml
@@ -0,0 +1,35 @@
+services:
+  llamastack:
+    image: llamastack/distribution-inline-vllm
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/my-run.yaml
+    ports:
+      - "5000:5000"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=0
+    command: []
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              # that's the closest analogue to --gpus; provide
+              # an integer amount of devices or 'all'
+              count: 1
+              # Devices are reserved using a list of capabilities, making
+              # capabilities the only required field. A device MUST
+              # satisfy all the requested capabilities for a successful
+              # reservation.
+              capabilities: [gpu]
+    runtime: nvidia
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
diff --git a/distributions/inline-vllm/run.yaml b/distributions/inline-vllm/run.yaml
new file mode 100644
index 000000000..aadf5c0ce
--- /dev/null
+++ b/distributions/inline-vllm/run.yaml
@@ -0,0 +1,66 @@
+version: '2'
+built_at: '2024-10-08T17:40:45.325529'
+image_name: local
+docker_image: null
+conda_env: local
+apis:
+- shields
+- agents
+- models
+- memory
+- memory_banks
+- inference
+- safety
+providers:
+  inference:
+  - provider_id: vllm-inference
+    provider_type: inline::vllm
+    config:
+      model: Llama3.2-3B-Instruct
+      tensor_parallel_size: 1
+      gpu_memory_utilization: 0.4
+      enforce_eager: true
+      max_tokens: 4096
+  - provider_id: vllm-safety
+    provider_type: inline::vllm
+    config:
+      model: Llama-Guard-3-1B
+      tensor_parallel_size: 1
+      gpu_memory_utilization: 0.2
+      enforce_eager: true
+      max_tokens: 4096
+  safety:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      llama_guard_shield:
+        model: Llama-Guard-3-1B
+        excluded_categories: []
+# Uncomment to use prompt guard
+#      prompt_guard_shield:
+#        model: Prompt-Guard-86M
+  memory:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config: {}
+  # Uncomment to use pgvector
+  # - provider_id: pgvector
+  #   provider_type: remote::pgvector
+  #   config:
+  #     host: 127.0.0.1
+  #     port: 5432
+  #     db: postgres
+  #     user: postgres
+  #     password: mysecretpassword
+  agents:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      persistence_store:
+        namespace: null
+        type: sqlite
+        db_path: ~/.llama/runtime/agents_store.db
+  telemetry:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config: {}
diff --git a/distributions/remote-vllm/build.yaml b/distributions/remote-vllm/build.yaml
index 8772548e0..52e5d0f2d 120000
--- a/distributions/remote-vllm/build.yaml
+++ b/distributions/remote-vllm/build.yaml
@@ -1 +1 @@
-../../llama_stack/templates/ollama/build.yaml
\ No newline at end of file
+../../llama_stack/templates/remote-vllm/build.yaml
\ No newline at end of file
diff --git a/distributions/remote-vllm/gpu/compose.yaml b/distributions/remote-vllm/compose.yaml
similarity index 70%
rename from distributions/remote-vllm/gpu/compose.yaml
rename to distributions/remote-vllm/compose.yaml
index c965c43c7..a83ed79fc 100644
--- a/distributions/remote-vllm/gpu/compose.yaml
+++ b/distributions/remote-vllm/compose.yaml
@@ -1,11 +1,11 @@
 services:
-  ollama:
-    image: ollama/ollama:latest
+  vllm:
+    image: vllm/vllm-openai:latest
     network_mode: "host"
     volumes:
-      - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
+      - $HOME/.cache/huggingface:/root/.cache/huggingface
     ports:
-      - "11434:11434"
+      - "8000:8000"
     devices:
       - nvidia.com/gpu=all
     environment:
@@ -27,17 +27,17 @@ services:
 
   llamastack:
     depends_on:
-      - ollama
-    image: llamastack/distribution-ollama
+      - vllm
+    image: llamastack/distribution-remote-vllm
     network_mode: "host"
     volumes:
       - ~/.llama:/root/.llama
       # Link to ollama run.yaml file
-      - ./run.yaml:/root/llamastack-run-ollama.yaml
+      - ./run.yaml:/root/llamastack-run-remote-vllm.yaml
     ports:
       - "5000:5000"
-    # Hack: wait for ollama server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-ollama.yaml"
+    # Hack: wait for vllm server to start before starting docker
+    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-remote-vllm.yaml"
     deploy:
       restart_policy:
         condition: on-failure
@@ -45,4 +45,4 @@
         max_attempts: 5
         window: 60s
 volumes:
-  ollama:
+  vllm:
diff --git a/distributions/remote-vllm/cpu/compose.yaml b/distributions/remote-vllm/cpu/compose.yaml
deleted file mode 100644
index dc51d4759..000000000
--- a/distributions/remote-vllm/cpu/compose.yaml
+++ /dev/null
@@ -1,30 +0,0 @@
-services:
-  ollama:
-    image: ollama/ollama:latest
-    network_mode: "host"
-    volumes:
-      - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast
-    ports:
-      - "11434:11434"
-    command: []
-  llamastack:
-    depends_on:
-      - ollama
-    image: llamastack/distribution-ollama
-    network_mode: "host"
-    volumes:
-      - ~/.llama:/root/.llama
-      # Link to ollama run.yaml file
-      - ./run.yaml:/root/my-run.yaml
-    ports:
-      - "5000:5000"
-    # Hack: wait for ollama server to start before starting docker
-    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
-    deploy:
-      restart_policy:
-        condition: on-failure
-        delay: 3s
-        max_attempts: 5
-        window: 60s
-volumes:
-  ollama:
diff --git a/distributions/remote-vllm/gpu/run.yaml b/distributions/remote-vllm/gpu/run.yaml
deleted file mode 100644
index 798dabc0b..000000000
--- a/distributions/remote-vllm/gpu/run.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-version: '2'
-built_at: '2024-10-08T17:40:45.325529'
-image_name: local
-docker_image: null
-conda_env: local
-apis:
-- shields
-- agents
-- models
-- memory
-- memory_banks
-- inference
-- safety
-providers:
-  inference:
-  - provider_id: ollama0
-    provider_type: remote::ollama
-    config:
-      url: http://127.0.0.1:14343
-  safety:
-  - provider_id: meta0
-    provider_type: meta-reference
-    config:
-      llama_guard_shield:
-        model: Llama-Guard-3-1B
-        excluded_categories: []
-        disable_input_check: false
-        disable_output_check: false
-      prompt_guard_shield:
-        model: Prompt-Guard-86M
-  memory:
-  - provider_id: meta0
-    provider_type: meta-reference
-    config: {}
-  agents:
-  - provider_id: meta0
-    provider_type: meta-reference
-    config:
-      persistence_store:
-        namespace: null
-        type: sqlite
-        db_path: ~/.llama/runtime/kvstore.db
-  telemetry:
-  - provider_id: meta0
-    provider_type: meta-reference
-    config: {}
diff --git a/distributions/remote-vllm/cpu/run.yaml b/distributions/remote-vllm/run.yaml
similarity index 90%
rename from distributions/remote-vllm/cpu/run.yaml
rename to distributions/remote-vllm/run.yaml
index 798dabc0b..2d0d36370 100644
--- a/distributions/remote-vllm/cpu/run.yaml
+++ b/distributions/remote-vllm/run.yaml
@@ -13,10 +13,10 @@ apis:
 - safety
 providers:
   inference:
-  - provider_id: ollama0
-    provider_type: remote::ollama
+  - provider_id: vllm0
+    provider_type: remote::vllm
     config:
-      url: http://127.0.0.1:14343
+      url: http://127.0.0.1:8000
   safety:
   - provider_id: meta0
     provider_type: meta-reference
diff --git a/llama_stack/providers/registry/inference.py b/llama_stack/providers/registry/inference.py
index dc6fa9592..1d3eabe0d 100644
--- a/llama_stack/providers/registry/inference.py
+++ b/llama_stack/providers/registry/inference.py
@@ -45,7 +45,7 @@ def available_providers() -> List[ProviderSpec]:
         ),
         InlineProviderSpec(
             api=Api.inference,
-            provider_type="vllm",
+            provider_type="inline::vllm",
             pip_packages=[
                 "vllm",
             ],
diff --git a/llama_stack/templates/inline-vllm/build.yaml b/llama_stack/templates/inline-vllm/build.yaml
new file mode 100644
index 000000000..d0fe93aa3
--- /dev/null
+++ b/llama_stack/templates/inline-vllm/build.yaml
@@ -0,0 +1,13 @@
+name: meta-reference-gpu
+distribution_spec:
+  docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
+  description: Use code from `llama_stack` itself to serve all llama stack APIs
+  providers:
+    inference: meta-reference
+    memory:
+    - meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/remote-vllm/build.yaml b/llama_stack/templates/remote-vllm/build.yaml
new file mode 100644
index 000000000..ea95992f3
--- /dev/null
+++ b/llama_stack/templates/remote-vllm/build.yaml
@@ -0,0 +1,12 @@
+name: remote-vllm
+distribution_spec:
+  description: Use (an external) vLLM server for running LLM inference
+  providers:
+    inference: remote::vllm
+    memory:
+    - meta-reference
+    - remote::chromadb
+    - remote::pgvector
+    safety: meta-reference
+    agents: meta-reference
+    telemetry: meta-reference
diff --git a/llama_stack/templates/vllm/build.yaml b/llama_stack/templates/vllm/build.yaml
deleted file mode 100644
index d842896db..000000000
--- a/llama_stack/templates/vllm/build.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-name: vllm
-distribution_spec:
-  description: Like local, but use vLLM for running LLM inference
-  providers:
-    inference: vllm
-    memory: meta-reference
-    safety: meta-reference
-    agents: meta-reference
-    telemetry: meta-reference