From 02c66b49fcab760560b32b038bd404a317ac84fc Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 8 Nov 2024 11:32:06 -0800 Subject: [PATCH] remote vllm distro --- .../{vllm => inline-vllm}/build.yaml | 0 distributions/ollama-gpu/build.yaml | 1 + .../{ollama/gpu => ollama-gpu}/compose.yaml | 0 .../{ollama/cpu => ollama-gpu}/run.yaml | 0 distributions/ollama/{cpu => }/compose.yaml | 0 distributions/ollama/{gpu => }/run.yaml | 0 distributions/remote-vllm/build.yaml | 1 + distributions/remote-vllm/cpu/compose.yaml | 30 ++++++++++++ distributions/remote-vllm/cpu/run.yaml | 46 ++++++++++++++++++ distributions/remote-vllm/gpu/compose.yaml | 48 +++++++++++++++++++ distributions/remote-vllm/gpu/run.yaml | 46 ++++++++++++++++++ .../self_hosted_distro/ollama.md | 28 +++++------ docs/source/getting_started/index.md | 6 ++- 13 files changed, 188 insertions(+), 18 deletions(-) rename distributions/{vllm => inline-vllm}/build.yaml (100%) create mode 120000 distributions/ollama-gpu/build.yaml rename distributions/{ollama/gpu => ollama-gpu}/compose.yaml (100%) rename distributions/{ollama/cpu => ollama-gpu}/run.yaml (100%) rename distributions/ollama/{cpu => }/compose.yaml (100%) rename distributions/ollama/{gpu => }/run.yaml (100%) create mode 120000 distributions/remote-vllm/build.yaml create mode 100644 distributions/remote-vllm/cpu/compose.yaml create mode 100644 distributions/remote-vllm/cpu/run.yaml create mode 100644 distributions/remote-vllm/gpu/compose.yaml create mode 100644 distributions/remote-vllm/gpu/run.yaml diff --git a/distributions/vllm/build.yaml b/distributions/inline-vllm/build.yaml similarity index 100% rename from distributions/vllm/build.yaml rename to distributions/inline-vllm/build.yaml diff --git a/distributions/ollama-gpu/build.yaml b/distributions/ollama-gpu/build.yaml new file mode 120000 index 000000000..8772548e0 --- /dev/null +++ b/distributions/ollama-gpu/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/ollama/build.yaml \ No newline at end of file diff --git a/distributions/ollama/gpu/compose.yaml b/distributions/ollama-gpu/compose.yaml similarity index 100% rename from distributions/ollama/gpu/compose.yaml rename to distributions/ollama-gpu/compose.yaml diff --git a/distributions/ollama/cpu/run.yaml b/distributions/ollama-gpu/run.yaml similarity index 100% rename from distributions/ollama/cpu/run.yaml rename to distributions/ollama-gpu/run.yaml diff --git a/distributions/ollama/cpu/compose.yaml b/distributions/ollama/compose.yaml similarity index 100% rename from distributions/ollama/cpu/compose.yaml rename to distributions/ollama/compose.yaml diff --git a/distributions/ollama/gpu/run.yaml b/distributions/ollama/run.yaml similarity index 100% rename from distributions/ollama/gpu/run.yaml rename to distributions/ollama/run.yaml diff --git a/distributions/remote-vllm/build.yaml b/distributions/remote-vllm/build.yaml new file mode 120000 index 000000000..8772548e0 --- /dev/null +++ b/distributions/remote-vllm/build.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/ollama/build.yaml \ No newline at end of file diff --git a/distributions/remote-vllm/cpu/compose.yaml b/distributions/remote-vllm/cpu/compose.yaml new file mode 100644 index 000000000..dc51d4759 --- /dev/null +++ b/distributions/remote-vllm/cpu/compose.yaml @@ -0,0 +1,30 @@ +services: + ollama: + image: ollama/ollama:latest + network_mode: "host" + volumes: + - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast + ports: + - "11434:11434" + command: [] 
+ llamastack: + depends_on: + - ollama + image: llamastack/distribution-ollama + network_mode: "host" + volumes: + - ~/.llama:/root/.llama + # Link to ollama run.yaml file + - ./run.yaml:/root/my-run.yaml + ports: + - "5000:5000" + # Hack: wait for ollama server to start before starting docker + entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml" + deploy: + restart_policy: + condition: on-failure + delay: 3s + max_attempts: 5 + window: 60s +volumes: + ollama: diff --git a/distributions/remote-vllm/cpu/run.yaml b/distributions/remote-vllm/cpu/run.yaml new file mode 100644 index 000000000..798dabc0b --- /dev/null +++ b/distributions/remote-vllm/cpu/run.yaml @@ -0,0 +1,46 @@ +version: '2' +built_at: '2024-10-08T17:40:45.325529' +image_name: local +docker_image: null +conda_env: local +apis: +- shields +- agents +- models +- memory +- memory_banks +- inference +- safety +providers: + inference: + - provider_id: ollama0 + provider_type: remote::ollama + config: + url: http://127.0.0.1:14343 + safety: + - provider_id: meta0 + provider_type: meta-reference + config: + llama_guard_shield: + model: Llama-Guard-3-1B + excluded_categories: [] + disable_input_check: false + disable_output_check: false + prompt_guard_shield: + model: Prompt-Guard-86M + memory: + - provider_id: meta0 + provider_type: meta-reference + config: {} + agents: + - provider_id: meta0 + provider_type: meta-reference + config: + persistence_store: + namespace: null + type: sqlite + db_path: ~/.llama/runtime/kvstore.db + telemetry: + - provider_id: meta0 + provider_type: meta-reference + config: {} diff --git a/distributions/remote-vllm/gpu/compose.yaml b/distributions/remote-vllm/gpu/compose.yaml new file mode 100644 index 000000000..c965c43c7 --- /dev/null +++ b/distributions/remote-vllm/gpu/compose.yaml @@ -0,0 +1,48 @@ +services: + ollama: + image: ollama/ollama:latest + network_mode: "host" + volumes: + - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast + ports: + - "11434:11434" + devices: + - nvidia.com/gpu=all + environment: + - CUDA_VISIBLE_DEVICES=0 + command: [] + deploy: + resources: + reservations: + devices: + - driver: nvidia + # that's the closest analogue to --gpus; provide + # an integer amount of devices or 'all' + count: 1 + # Devices are reserved using a list of capabilities, making + # capabilities the only required field. A device MUST + # satisfy all the requested capabilities for a successful + # reservation. 
+              capabilities: [gpu]
+    runtime: nvidia
+  llamastack:
+    depends_on:
+    - ollama
+    image: llamastack/distribution-ollama
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      # Link to ollama run.yaml file
+      - ./run.yaml:/root/llamastack-run-ollama.yaml
+    ports:
+      - "5000:5000"
+    # Hack: wait for ollama server to start before starting docker
+    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-ollama.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
+volumes:
+  ollama:
diff --git a/distributions/remote-vllm/gpu/run.yaml b/distributions/remote-vllm/gpu/run.yaml
new file mode 100644
index 000000000..798dabc0b
--- /dev/null
+++ b/distributions/remote-vllm/gpu/run.yaml
@@ -0,0 +1,46 @@
+version: '2'
+built_at: '2024-10-08T17:40:45.325529'
+image_name: local
+docker_image: null
+conda_env: local
+apis:
+- shields
+- agents
+- models
+- memory
+- memory_banks
+- inference
+- safety
+providers:
+  inference:
+  - provider_id: ollama0
+    provider_type: remote::ollama
+    config:
+      url: http://127.0.0.1:14343
+  safety:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      llama_guard_shield:
+        model: Llama-Guard-3-1B
+        excluded_categories: []
+        disable_input_check: false
+        disable_output_check: false
+      prompt_guard_shield:
+        model: Prompt-Guard-86M
+  memory:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config: {}
+  agents:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config:
+      persistence_store:
+        namespace: null
+        type: sqlite
+        db_path: ~/.llama/runtime/kvstore.db
+  telemetry:
+  - provider_id: meta0
+    provider_type: meta-reference
+    config: {}
diff --git a/docs/source/getting_started/distributions/self_hosted_distro/ollama.md b/docs/source/getting_started/distributions/self_hosted_distro/ollama.md
index 0d4d90ee6..03bc3eb63 100644
--- a/docs/source/getting_started/distributions/self_hosted_distro/ollama.md
+++ b/docs/source/getting_started/distributions/self_hosted_distro/ollama.md
@@ -7,16 +7,22 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
 | **Provider(s)** | remote::ollama | meta-reference | remote::pgvector, remote::chroma | remote::ollama | meta-reference |
 
 
-### Docker: Start a Distribution (Single Node GPU)
+### Docker: Start the Distribution (Single Node, CPU-only desktop machine)
+
+> [!NOTE]
+> This starts an Ollama server in CPU-only mode; please see the [Ollama documentation](https://github.com/ollama/ollama) for details on serving models on CPU.
+
+```
+$ cd distributions/ollama; docker compose up
+```
+
+### Docker: Start the Distribution (Single Node with NVIDIA GPUs)
 
 > [!NOTE]
 > This assumes you have access to GPU to start a Ollama server with access to your GPU.
 
 ```
-$ cd distributions/ollama/gpu
-$ ls
-compose.yaml run.yaml
-$ docker compose up
+$ cd distributions/ollama-gpu; docker compose up
 ```
 
 You will see outputs similar to following ---
@@ -38,18 +44,6 @@ To kill the server
 docker compose down
 ```
 
-### Docker: Start the Distribution (Single Node CPU)
-
-> [!NOTE]
-> This will start an ollama server with CPU only, please see [Ollama Documentations](https://github.com/ollama/ollama) for serving models on CPU only.
-
-```
-$ cd distributions/ollama/cpu
-$ ls
-compose.yaml run.yaml
-$ docker compose up
-```
-
 ### Conda: ollama run + llama stack run
 
 If you wish to separately spin up a Ollama server, and connect with Llama Stack, you may use the following commands.
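For context on the `Conda: ollama run + llama stack run` section above, the concrete commands fall outside this hunk. A minimal sketch of that flow, assuming a locally installed Ollama CLI, an example model tag, and the run.yaml path and port used elsewhere in this patch:

```
# Terminal 1: start an Ollama server and load a model
# (the model tag is an assumed example; use any model the remote::ollama provider supports)
ollama run llama3.1:8b-instruct-fp16

# Terminal 2: start Llama Stack against the local Ollama server
# (the run.yaml path and --port value are assumptions; match them to your checkout and compose setup)
llama stack run ./distributions/ollama/run.yaml --port 5000
```

Before running the second command, make sure the `url` under the `remote::ollama` inference provider in run.yaml points at the port the Ollama server is actually listening on.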
diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index 92643d87e..afe26b4bd 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -144,7 +144,11 @@ docker compose down :::{tab-item} ollama ``` -$ cd llama-stack/distributions/ollama/cpu && docker compose up +$ cd llama-stack/distributions/ollama && docker compose up + +# OR + +$ cd llama-stack/distributions/ollama-gpu && docker compose up ``` You will see outputs similar to following ---
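Once `docker compose up` has settled (note the 60-second startup sleep in the compose entrypoints above), a quick sanity check of the wiring might look like the sketch below. The service names `ollama` and `llamastack` and the published ports 11434 and 5000 come from the compose files in this patch; the model tag is an assumed example:

```
# Confirm the Ollama server is answering on its published port
curl http://127.0.0.1:11434/api/tags

# Pull a model into the ollama container so the remote::ollama provider has something to serve
# (model tag is an assumed example)
docker compose exec ollama ollama pull llama3.1:8b-instruct-fp16

# Confirm the Llama Stack server is listening on the port mapped in compose.yaml
curl -s -o /dev/null -w "%{http_code}\n" http://127.0.0.1:5000
```

If the last call cannot connect, check the container logs with `docker compose logs llamastack` before assuming a configuration problem.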