From 985ff4d6cee2119d06d5acaa38737d0db5c60834 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Mon, 28 Oct 2024 15:10:40 -0700
Subject: [PATCH] update distributions/readmes

---
 distributions/fireworks/compose.yaml          |  2 -
 distributions/meta-reference-gpu/README.md    | 60 ++++++++++++++++++-
 distributions/meta-reference-gpu/compose.yaml | 35 +++++++++++
 distributions/meta-reference-gpu/run.yaml     | 11 +++-
 distributions/together/compose.yaml           |  2 -
 5 files changed, 102 insertions(+), 8 deletions(-)
 create mode 100644 distributions/meta-reference-gpu/compose.yaml

diff --git a/distributions/fireworks/compose.yaml b/distributions/fireworks/compose.yaml
index 552806745..71137c040 100644
--- a/distributions/fireworks/compose.yaml
+++ b/distributions/fireworks/compose.yaml
@@ -4,11 +4,9 @@ services:
     network_mode: "host"
     volumes:
       - ~/.llama:/root/.llama
-      # Link to ollama run.yaml file
       - ./run.yaml:/root/llamastack-run-fireworks.yaml
     ports:
       - "5000:5000"
-    # Hack: wait for ollama server to start before starting docker
     entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-fireworks.yaml"
     deploy:
       restart_policy:
diff --git a/distributions/meta-reference-gpu/README.md b/distributions/meta-reference-gpu/README.md
index 7f209c4a9..677ea71e5 100644
--- a/distributions/meta-reference-gpu/README.md
+++ b/distributions/meta-reference-gpu/README.md
@@ -10,6 +10,13 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
 
 ### Start the Distribution (Single Node GPU)
 
+```
+$ cd distributions/meta-reference-gpu
+$ ls
+build.yaml compose.yaml README.md run.yaml
+$ docker compose up
+```
+
 > [!NOTE]
 > This assumes you have access to GPU to start a local server with access to your GPU.
 
@@ -18,7 +25,7 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo
 
 > `~/.llama` should be the path containing downloaded weights of Llama models.
 
-To download and start running a pre-built docker container, you may use the following commands:
+This will download and start running a pre-built docker container. Alternatively, you may use the following commands:
 
 ```
 docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-gpu --yaml_config /root/my-run.yaml
@@ -26,3 +33,54 @@ docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.
 
 ### Alternative (Build and start distribution locally via conda)
 - You may checkout the [Getting Started](../../docs/getting_started.md) for more details on building locally via conda and starting up a meta-reference distribution.
+
+### Start the Distribution with a pgvector/chromadb Memory Provider
+#### pgvector
+1. Start running the pgvector server:
+
+```
+docker run --network host --name mypostgres -it -p 5432:5432 -e POSTGRES_PASSWORD=mysecretpassword -e POSTGRES_USER=postgres -e POSTGRES_DB=postgres pgvector/pgvector:pg16
+```
+
+2. Edit the `run.yaml` file to point to the pgvector server.
+```
+memory:
+  - provider_id: pgvector
+    provider_type: remote::pgvector
+    config:
+      host: 127.0.0.1
+      port: 5432
+      db: postgres
+      user: postgres
+      password: mysecretpassword
+```
+
+> [!NOTE]
+> If you get a `RuntimeError: Vector extension is not installed.` error, you will need to run `CREATE EXTENSION IF NOT EXISTS vector;` to install the vector extension. E.g.
+
+```
+docker exec -it mypostgres ./bin/psql -U postgres
+postgres=# CREATE EXTENSION IF NOT EXISTS vector;
+postgres=# SELECT extname from pg_extension;
+ extname
+```
+
+3. Run `docker compose up` with the updated `run.yaml` file.
+
+#### chromadb
+1. Start running the chromadb server:
+```
+docker run -it --network host --name chromadb -p 6000:6000 -v ./chroma_vdb:/chroma/chroma -e IS_PERSISTENT=TRUE chromadb/chroma:latest
+```
+
+2. Edit the `run.yaml` file to point to the chromadb server.
+```
+memory:
+  - provider_id: chromadb
+    provider_type: remote::chromadb
+    config:
+      host: localhost
+      port: 6000
+```
+
+3. Run `docker compose up` with the updated `run.yaml` file.
diff --git a/distributions/meta-reference-gpu/compose.yaml b/distributions/meta-reference-gpu/compose.yaml
new file mode 100644
index 000000000..70b37f260
--- /dev/null
+++ b/distributions/meta-reference-gpu/compose.yaml
@@ -0,0 +1,35 @@
+services:
+  llamastack:
+    image: llamastack/distribution-meta-reference-gpu
+    network_mode: "host"
+    volumes:
+      - ~/.llama:/root/.llama
+      - ./run.yaml:/root/my-run.yaml
+    ports:
+      - "5000:5000"
+    devices:
+      - nvidia.com/gpu=all
+    environment:
+      - CUDA_VISIBLE_DEVICES=0
+    command: []
+    deploy:
+      resources:
+        reservations:
+          devices:
+          - driver: nvidia
+            # that's the closest analogue to --gpus; provide
+            # an integer amount of devices or 'all'
+            count: 1
+            # Devices are reserved using a list of capabilities, making
+            # capabilities the only required field. A device MUST
+            # satisfy all the requested capabilities for a successful
+            # reservation.
+            capabilities: [gpu]
+    runtime: nvidia
+    entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml"
+    deploy:
+      restart_policy:
+        condition: on-failure
+        delay: 3s
+        max_attempts: 5
+        window: 60s
diff --git a/distributions/meta-reference-gpu/run.yaml b/distributions/meta-reference-gpu/run.yaml
index 724ca030a..eca64027e 100644
--- a/distributions/meta-reference-gpu/run.yaml
+++ b/distributions/meta-reference-gpu/run.yaml
@@ -33,9 +33,14 @@ providers:
       prompt_guard_shield:
         model: Prompt-Guard-86M
   memory:
-  - provider_id: meta0
-    provider_type: meta-reference
-    config: {}
+  - provider_id: pgvector
+    provider_type: remote::pgvector
+    config:
+      host: 127.0.0.1
+      port: 5432
+      db: postgres
+      user: postgres
+      password: mysecretpassword
   agents:
   - provider_id: meta0
     provider_type: meta-reference
diff --git a/distributions/together/compose.yaml b/distributions/together/compose.yaml
index 75c96b686..8d938990e 100644
--- a/distributions/together/compose.yaml
+++ b/distributions/together/compose.yaml
@@ -4,11 +4,9 @@ services:
     network_mode: "host"
     volumes:
       - ~/.llama:/root/.llama
-      # Link to ollama run.yaml file
       - ./run.yaml:/root/llamastack-run-together.yaml
     ports:
       - "5000:5000"
-    # Hack: wait for ollama server to start before starting docker
     entrypoint: bash -c "python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-together.yaml"
     deploy:
       restart_policy:
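The pgvector steps in the README changes above configure the provider but do not show how to confirm the database is actually usable for vector search. The following smoke test is not part of the patch; it is a sketch that assumes the `mypostgres` container and credentials from the `docker run` command above, and the `smoke_test` table name is a made-up example.

```
# Confirm the vector extension can be created (the same step the NOTE above describes).
docker exec -it mypostgres psql -U postgres -c "CREATE EXTENSION IF NOT EXISTS vector;"
# Create a throwaway table with a 3-dimensional vector column, insert two rows,
# and run a nearest-neighbour query with pgvector's `<->` (L2 distance) operator.
docker exec -it mypostgres psql -U postgres -c "CREATE TABLE IF NOT EXISTS smoke_test (id bigserial PRIMARY KEY, embedding vector(3));"
docker exec -it mypostgres psql -U postgres -c "INSERT INTO smoke_test (embedding) VALUES ('[1,2,3]'), ('[4,5,6]');"
docker exec -it mypostgres psql -U postgres -c "SELECT id, embedding <-> '[3,1,2]' AS distance FROM smoke_test ORDER BY distance LIMIT 1;"
# Drop the throwaway table once the query returns a row.
docker exec -it mypostgres psql -U postgres -c "DROP TABLE smoke_test;"
```

If the `CREATE EXTENSION` command fails here, the `remote::pgvector` provider in `run.yaml` will fail the same way, which makes this a cheap way to separate database problems from llama-stack configuration problems.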
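For the chromadb path, a similar pre-flight check is to hit chroma's heartbeat endpoint before pointing `run.yaml` at it. Again, this is not part of the patch: it assumes the `chromadb` container from the README changes above and the port 6000 used there.

```
# A JSON response with a nanosecond timestamp means the chroma HTTP API is up.
curl http://localhost:6000/api/v1/heartbeat
# If that fails, note that published ports (-p 6000:6000) are ignored under
# --network host and the chroma image listens on port 8000 by default, so the
# server may actually answer on 8000; the `port` in run.yaml must match whichever
# port responds here.
curl http://localhost:8000/api/v1/heartbeat
```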