From 1ecaf2cb3c0d82c242214e5a58024ab5bd228adb Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Mon, 18 Nov 2024 10:57:20 -0800 Subject: [PATCH] Add ollama/pull-models.sh --- distributions/ollama/compose.yaml | 63 +++++- distributions/ollama/pull-models.sh | 18 ++ distributions/ollama/run-with-safety.yaml | 9 +- distributions/ollama/run.yaml | 6 +- .../remote-vllm/run-with-safety.yaml | 5 +- distributions/remote-vllm/run.yaml | 4 +- distributions/tgi/run-with-safety.yaml | 7 +- distributions/tgi/run.yaml | 4 +- .../self_hosted_distro/ollama.md | 27 +-- .../self_hosted_distro/remote-vllm.md | 29 +-- .../distributions/self_hosted_distro/tgi.md | 195 +++++++++--------- llama_stack/scripts/distro_codegen.py | 4 +- llama_stack/templates/ollama/ollama.py | 4 +- llama_stack/templates/template.py | 31 +-- llama_stack/templates/tgi/doc_template.md | 178 ++++++++-------- llama_stack/templates/tgi/tgi.py | 10 +- 16 files changed, 305 insertions(+), 289 deletions(-) create mode 100755 distributions/ollama/pull-models.sh diff --git a/distributions/ollama/compose.yaml b/distributions/ollama/compose.yaml index dc51d4759..176f19d6b 100644 --- a/distributions/ollama/compose.yaml +++ b/distributions/ollama/compose.yaml @@ -1,30 +1,71 @@ services: ollama: image: ollama/ollama:latest - network_mode: "host" + network_mode: ${NETWORK_MODE:-bridge} volumes: - - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast + - ~/.ollama:/root/.ollama ports: - "11434:11434" + environment: + OLLAMA_DEBUG: 1 command: [] + deploy: + resources: + limits: + memory: 8G # Set maximum memory + reservations: + memory: 8G # Set minimum memory reservation + # healthcheck: + # # ugh, no CURL in ollama image + # test: ["CMD", "curl", "-f", "http://ollama:11434"] + # interval: 10s + # timeout: 5s + # retries: 5 + + ollama-init: + image: ollama/ollama:latest + depends_on: + - ollama + # condition: service_healthy + network_mode: ${NETWORK_MODE:-bridge} + environment: + - OLLAMA_HOST=ollama + - INFERENCE_MODEL=${INFERENCE_MODEL} + - SAFETY_MODEL=${SAFETY_MODEL:-} + volumes: + - ~/.ollama:/root/.ollama + - ./pull-models.sh:/pull-models.sh + entrypoint: ["/pull-models.sh"] + llamastack: depends_on: - - ollama - image: llamastack/distribution-ollama - network_mode: "host" + ollama: + condition: service_started + ollama-init: + condition: service_started + image: ${LLAMA_STACK_IMAGE:-llamastack/distribution-ollama} + network_mode: ${NETWORK_MODE:-bridge} volumes: - ~/.llama:/root/.llama # Link to ollama run.yaml file - - ./run.yaml:/root/my-run.yaml + - ~/local/llama-stack/:/app/llama-stack-source + - ./run${SAFETY_MODEL:+-with-safety}.yaml:/root/my-run.yaml ports: - - "5000:5000" - # Hack: wait for ollama server to start before starting docker - entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/my-run.yaml" + - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}" + environment: + - INFERENCE_MODEL=${INFERENCE_MODEL} + - SAFETY_MODEL=${SAFETY_MODEL:-} + - OLLAMA_URL=http://ollama:11434 + entrypoint: > + python -m llama_stack.distribution.server.server /root/my-run.yaml \ + --port ${LLAMA_STACK_PORT:-5001} deploy: restart_policy: condition: on-failure - delay: 3s - max_attempts: 5 + delay: 10s + max_attempts: 3 window: 60s volumes: ollama: + ollama-init: + llamastack: diff --git a/distributions/ollama/pull-models.sh b/distributions/ollama/pull-models.sh new file mode 100755 index 000000000..fb5bf8a4a --- /dev/null +++ b/distributions/ollama/pull-models.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +echo "Preloading (${INFERENCE_MODEL}, ${SAFETY_MODEL})..." +for model in ${INFERENCE_MODEL} ${SAFETY_MODEL}; do + echo "Preloading $model..." + if ! ollama run "$model"; then + echo "Failed to pull and run $model" + exit 1 + fi +done + +echo "All models pulled successfully" diff --git a/distributions/ollama/run-with-safety.yaml b/distributions/ollama/run-with-safety.yaml index 32eb30eea..d0f657377 100644 --- a/distributions/ollama/run-with-safety.yaml +++ b/distributions/ollama/run-with-safety.yaml @@ -1,13 +1,12 @@ version: '2' -built_at: 2024-11-17 19:33:00 image_name: ollama docker_image: null conda_env: null apis: -- memory - agents -- safety - inference +- memory +- safety - telemetry providers: inference: @@ -46,11 +45,11 @@ metadata_store: models: - metadata: {} model_id: ${env.INFERENCE_MODEL} - provider_id: ollama-inference + provider_id: ollama provider_model_id: null - metadata: {} model_id: ${env.SAFETY_MODEL} - provider_id: ollama-safety + provider_id: ollama provider_model_id: null shields: - params: null diff --git a/distributions/ollama/run.yaml b/distributions/ollama/run.yaml index 5d5e474e5..c4003006b 100644 --- a/distributions/ollama/run.yaml +++ b/distributions/ollama/run.yaml @@ -3,10 +3,10 @@ image_name: ollama docker_image: null conda_env: null apis: -- memory - agents -- safety - inference +- memory +- safety - telemetry providers: inference: @@ -45,7 +45,7 @@ metadata_store: models: - metadata: {} model_id: ${env.INFERENCE_MODEL} - provider_id: ollama-inference + provider_id: ollama provider_model_id: null shields: [] memory_banks: [] diff --git a/distributions/remote-vllm/run-with-safety.yaml b/distributions/remote-vllm/run-with-safety.yaml index 4f4cce415..94db5fe5d 100644 --- a/distributions/remote-vllm/run-with-safety.yaml +++ b/distributions/remote-vllm/run-with-safety.yaml @@ -1,13 +1,12 @@ version: '2' -built_at: 2024-11-17 19:33:00 image_name: remote-vllm docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3 conda_env: null apis: -- memory - agents -- safety - inference +- memory +- safety - telemetry providers: inference: diff --git a/distributions/remote-vllm/run.yaml b/distributions/remote-vllm/run.yaml index fbda1dae8..e99f41760 100644 --- a/distributions/remote-vllm/run.yaml +++ b/distributions/remote-vllm/run.yaml @@ -3,10 +3,10 @@ image_name: remote-vllm docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3 conda_env: null apis: -- memory - agents -- safety - inference +- memory +- safety - telemetry providers: inference: diff --git a/distributions/tgi/run-with-safety.yaml b/distributions/tgi/run-with-safety.yaml index 35828a37a..b1f12cc88 100644 --- a/distributions/tgi/run-with-safety.yaml +++ b/distributions/tgi/run-with-safety.yaml @@ -1,13 +1,12 @@ version: '2' -built_at: 2024-11-17 19:33:00 image_name: tgi docker_image: llamastack/distribution-tgi:test-0.0.52rc3 conda_env: null apis: -- memory - agents -- safety - inference +- memory +- safety - telemetry providers: inference: @@ -18,7 +17,7 @@ providers: - provider_id: tgi-safety provider_type: remote::tgi config: - url: ${env.SAFETY_TGI_URL} + url: ${env.TGI_SAFETY_URL} memory: - provider_id: faiss provider_type: inline::faiss diff --git a/distributions/tgi/run.yaml b/distributions/tgi/run.yaml index ca462d6b1..5571beabd 100644 --- a/distributions/tgi/run.yaml +++ b/distributions/tgi/run.yaml @@ -3,10 +3,10 @@ image_name: tgi docker_image: llamastack/distribution-tgi:test-0.0.52rc3 conda_env: null apis: -- memory - agents -- safety - inference +- memory +- safety - telemetry providers: inference: diff --git a/docs/source/getting_started/distributions/self_hosted_distro/ollama.md b/docs/source/getting_started/distributions/self_hosted_distro/ollama.md index 9f3757301..3db186f18 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/ollama.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/ollama.md @@ -2,27 +2,16 @@ The `llamastack/distribution-ollama` distribution consists of the following provider configurations. - Provider Configuration -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ -┃ API ┃ Provider(s) ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ -│ agents │ `inline::meta-reference` │ -│ inference │ `remote::ollama` │ -│ memory │ `inline::faiss`, `remote::chromadb`, `remote::pgvector` │ -│ safety │ `inline::llama-guard` │ -│ telemetry │ `inline::meta-reference` │ -└───────────┴─────────────────────────────────────────────────────────┘ +| API | Provider(s) | +|-----|-------------| +| agents | `inline::meta-reference` | +| inference | `remote::ollama` | +| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | +| safety | `inline::llama-guard` | +| telemetry | `inline::meta-reference` | -You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Environment Variables - -The following environment variables can be configured: - -- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) -- `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`) -- `OLLAMA_URL`: URL of the Ollama server (default: `http://host.docker.internal:11434`) -- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`) -### Models +You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Models The following models are configured by default: - `${env.INFERENCE_MODEL}` diff --git a/docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md b/docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md index 0ecfafaea..dd3684436 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md @@ -2,29 +2,16 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations: - Provider Configuration -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ -┃ API ┃ Provider(s) ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ -│ agents │ `inline::meta-reference` │ -│ inference │ `remote::vllm` │ -│ memory │ `inline::faiss`, `remote::chromadb`, `remote::pgvector` │ -│ safety │ `inline::llama-guard` │ -│ telemetry │ `inline::meta-reference` │ -└───────────┴─────────────────────────────────────────────────────────┘ +| API | Provider(s) | +|-----|-------------| +| agents | `inline::meta-reference` | +| inference | `remote::vllm` | +| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | +| safety | `inline::llama-guard` | +| telemetry | `inline::meta-reference` | -You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference.### Environment Variables - -The following environment variables can be configured: - -- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) -- `INFERENCE_MODEL`: Inference model loaded into the vLLM server (default: `meta-llama/Llama-3.2-3B-Instruct`) -- `VLLM_URL`: URL of the vLLM server with the main inference model (default: `http://host.docker.internal:5100}/v1`) -- `MAX_TOKENS`: Maximum number of tokens for generation (default: `4096`) -- `SAFETY_VLLM_URL`: URL of the vLLM server with the safety model (default: `http://host.docker.internal:5101/v1`) -- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`) - +You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference. ### Models The following models are configured by default: diff --git a/docs/source/getting_started/distributions/self_hosted_distro/tgi.md b/docs/source/getting_started/distributions/self_hosted_distro/tgi.md index bae0a19ac..fff8c1d08 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/tgi.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/tgi.md @@ -2,128 +2,127 @@ The `llamastack/distribution-tgi` distribution consists of the following provider configurations. - Provider Configuration -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ -┃ API ┃ Provider(s) ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ -│ agents │ `inline::meta-reference` │ -│ inference │ `remote::tgi` │ -│ memory │ `inline::faiss`, `remote::chromadb`, `remote::pgvector` │ -│ safety │ `inline::llama-guard` │ -│ telemetry │ `inline::meta-reference` │ -└───────────┴─────────────────────────────────────────────────────────┘ +| API | Provider(s) | +|-----|-------------| +| agents | `inline::meta-reference` | +| inference | `remote::tgi` | +| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | +| safety | `inline::llama-guard` | +| telemetry | `inline::meta-reference` | -You can use this distribution if you have GPUs and want to run an independent TGI server container for running inference.### Environment Variables +You can use this distribution if you have GPUs and want to run an independent TGI server container for running inference. + +### Environment Variables The following environment variables can be configured: - `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) - `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`) -- `TGI_URL`: URL of the TGI server with the main inference model (default: `http://host.docker.internal:8080}/v1`) -- `SAFETY_TGI_URL`: URL of the TGI server with the safety model (default: `http://host.docker.internal:8081/v1`) +- `TGI_URL`: URL of the TGI server with the main inference model (default: `http://127.0.0.1:8080}/v1`) +- `TGI_SAFETY_URL`: URL of the TGI server with the safety model (default: `http://127.0.0.1:8081/v1`) - `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`) -### Models - -The following models are configured by default: -- `${env.INFERENCE_MODEL}` -- `${env.SAFETY_MODEL}` -## Using Docker Compose +## Setting up TGI server -You can use `docker compose` to start a TGI container and Llama Stack server container together. +Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker: ```bash -$ cd distributions/tgi; docker compose up +export TGI_INFERENCE_PORT=8080 +export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct +export CUDA_VISIBLE_DEVICES=0 + +docker run --rm -it \ + -v $HOME/.cache/huggingface:/data \ + -p $TGI_INFERENCE_PORT:$TGI_INFERENCE_PORT \ + --gpus $CUDA_VISIBLE_DEVICES \ + ghcr.io/huggingface/text-generation-inference:2.3.1 \ + --dtype bfloat16 \ + --usage-stats off \ + --sharded false \ + --cuda-memory-fraction 0.7 \ + --model-id $INFERENCE_MODEL \ + --port $TGI_INFERENCE_PORT ``` -The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference. You should be able to see the following outputs -- -```bash -[text-generation-inference] | 2024-10-15T18:56:33.810397Z INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama) -[text-generation-inference] | 2024-10-15T18:56:33.810448Z WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0 -[text-generation-inference] | 2024-10-15T18:56:33.864143Z INFO text_generation_router::server: router/src/server.rs:2353: Connected -INFO: Started server process [1] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://[::]:5001 (Press CTRL+C to quit) -``` - -To kill the server -```bash -docker compose down -``` - - -### Conda: TGI server + llama stack run - -If you wish to separately spin up a TGI server, and connect with Llama Stack, you may use the following commands. - -#### Start TGI server locally -- Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. +If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: ```bash -docker run --rm -it -v $HOME/.cache/huggingface:/data \ - -p 5009:5009 --gpus all \ - ghcr.io/huggingface/text-generation-inference:latest \ - --dtype bfloat16 --usage-stats on --sharded false \ - --model-id meta-llama/Llama-3.2-3B-Instruct --port 5009 +export TGI_SAFETY_PORT=8081 +export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B +export CUDA_VISIBLE_DEVICES=1 + +docker run --rm -it \ + -v $HOME/.cache/huggingface:/data \ + -p $TGI_SAFETY_PORT:$TGI_SAFETY_PORT \ + --gpus $CUDA_VISIBLE_DEVICES \ + ghcr.io/huggingface/text-generation-inference:2.3.1 \ + --dtype bfloat16 \ + --usage-stats off \ + --sharded false \ + --model-id $SAFETY_MODEL \ + --port $TGI_SAFETY_PORT ``` -#### Start Llama Stack server pointing to TGI server +## Running Llama Stack with TGI as the inference provider -**Via Conda** +Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. + +```bash +LLAMA_STACK_PORT=5001 +docker run \ + --network host \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run.yaml:/root/my-run.yaml \ + llamastack/distribution-tgi \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT +``` + +If you are using Llama Stack Safety / Shield APIs, use: + +```bash +docker run \ + --network host \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run-with-safety.yaml:/root/my-run.yaml \ + llamastack/distribution-tgi \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT \ + --env SAFETY_MODEL=$SAFETY_MODEL \ + --env TGI_SAFETY_URL=http://host.docker.internal:$TGI_SAFETY_PORT +``` + +### Via Conda + +Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. ```bash llama stack build --template tgi --image-type conda -# -- start a TGI server endpoint -llama stack run ./gpu/run.yaml +llama stack run ./run.yaml + --port 5001 + --env INFERENCE_MODEL=$INFERENCE_MODEL + --env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT ``` -**Via Docker** +If you are using Llama Stack Safety / Shield APIs, use: + ```bash -docker run --network host -it -p 5001:5001 \ - -v ./run.yaml:/root/my-run.yaml --gpus=all \ - llamastack/distribution-tgi \ - --yaml_config /root/my-run.yaml -``` - -We have provided a template `run.yaml` file in the `distributions/tgi` directory. Make sure in your `run.yaml` file, you inference provider is pointing to the correct TGI server endpoint. E.g. -```yaml -inference: - - provider_id: tgi0 - provider_type: remote::tgi - config: - url: http://127.0.0.1:5009 -``` - - -### (Optional) Update Model Serving Configuration -To serve a new model with `tgi`, change the docker command flag `--model-id `. - -This can be done by edit the `command` args in `compose.yaml`. E.g. Replace "Llama-3.2-1B-Instruct" with the model you want to serve. - -```yaml -command: > - --dtype bfloat16 --usage-stats on --sharded false - --model-id meta-llama/Llama-3.2-1B-Instruct - --port 5009 --cuda-memory-fraction 0.7 -``` - -or by changing the docker run command's `--model-id` flag -```bash -docker run --rm -it -v $HOME/.cache/huggingface:/data \ - -p 5009:5009 --gpus all \ - ghcr.io/huggingface/text-generation-inference:latest \ - --dtype bfloat16 --usage-stats off --sharded false \ - --model-id meta-llama/Llama-3.2-3B-Instruct --port 5009 -``` - -In `run.yaml`, make sure you point the correct server endpoint to the TGI server endpoint serving your model. -```yaml -inference: - - provider_id: tgi0 - provider_type: remote::tgi - config: - url: http://127.0.0.1:5009 +llama stack run ./run-with-safety.yaml + --port 5001 + --env INFERENCE_MODEL=$INFERENCE_MODEL + --env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT + --env SAFETY_MODEL=$SAFETY_MODEL + --env TGI_SAFETY_URL=http://127.0.0.1:$TGI_SAFETY_PORT ``` diff --git a/llama_stack/scripts/distro_codegen.py b/llama_stack/scripts/distro_codegen.py index bff39c57d..2f41ec6ea 100644 --- a/llama_stack/scripts/distro_codegen.py +++ b/llama_stack/scripts/distro_codegen.py @@ -21,7 +21,9 @@ def find_template_dirs(templates_dir: Path) -> Iterator[Path]: if not templates_dir.exists(): raise FileNotFoundError(f"Templates directory not found: {templates_dir}") - return (d for d in templates_dir.iterdir() if d.is_dir()) + return ( + d for d in templates_dir.iterdir() if d.is_dir() and d.name != "__pycache__" + ) def process_template(template_dir: Path, progress) -> None: diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py index deb254c80..d40b02a2c 100644 --- a/llama_stack/templates/ollama/ollama.py +++ b/llama_stack/templates/ollama/ollama.py @@ -28,11 +28,11 @@ def get_distribution_template() -> DistributionTemplate: inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", - provider_id="ollama-inference", + provider_id="ollama", ) safety_model = ModelInput( model_id="${env.SAFETY_MODEL}", - provider_id="ollama-safety", + provider_id="ollama", ) return DistributionTemplate( diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index 50d24b256..3048889a9 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ -4,18 +4,13 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from io import StringIO - from pathlib import Path -from typing import Dict, List, Literal, Optional, Set, Tuple +from typing import Dict, List, Literal, Optional, Tuple import jinja2 import yaml from pydantic import BaseModel, Field -from rich.console import Console -from rich.table import Table - from llama_stack.distribution.datatypes import ( Api, BuildConfig, @@ -80,12 +75,12 @@ class RunConfigSettings(BaseModel): ] # Get unique set of APIs from providers - apis: Set[str] = set(providers.keys()) + apis = list(sorted(providers.keys())) return StackRunConfig( image_name=name, docker_image=docker_image, - apis=list(apis), + apis=apis, providers=provider_configs, metadata_store=SqliteKVStoreConfig.sample_run_config( __distro_dir__=f"distributions/{name}", @@ -111,7 +106,7 @@ class DistributionTemplate(BaseModel): template_path: Path # Optional configuration - docker_compose_env_vars: Optional[Dict[str, Tuple[str, str]]] = None + run_config_env_vars: Optional[Dict[str, Tuple[str, str]]] = None docker_image: Optional[str] = None default_models: Optional[List[ModelInput]] = None @@ -128,20 +123,12 @@ class DistributionTemplate(BaseModel): ) def generate_markdown_docs(self) -> str: - """Generate markdown documentation using both Jinja2 templates and rich tables.""" - # First generate the providers table using rich - output = StringIO() - console = Console(file=output, force_terminal=False) - - table = Table(title="Provider Configuration", show_header=True) - table.add_column("API", style="bold") - table.add_column("Provider(s)") + providers_table = "| API | Provider(s) |\n" + providers_table += "|-----|-------------|\n" for api, providers in sorted(self.providers.items()): - table.add_row(api, ", ".join(f"`{p}`" for p in providers)) - - console.print(table) - providers_table = output.getvalue() + providers_str = ", ".join(f"`{p}`" for p in providers) + providers_table += f"| {api} | {providers_str} |\n" template = self.template_path.read_text() # Render template with rich-generated table @@ -152,7 +139,7 @@ class DistributionTemplate(BaseModel): description=self.description, providers=self.providers, providers_table=providers_table, - docker_compose_env_vars=self.docker_compose_env_vars, + run_config_env_vars=self.run_config_env_vars, default_models=self.default_models, ) diff --git a/llama_stack/templates/tgi/doc_template.md b/llama_stack/templates/tgi/doc_template.md index 95963861a..9b22b3b37 100644 --- a/llama_stack/templates/tgi/doc_template.md +++ b/llama_stack/templates/tgi/doc_template.md @@ -6,120 +6,116 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following You can use this distribution if you have GPUs and want to run an independent TGI server container for running inference. -{%- if docker_compose_env_vars %} +{% if run_config_env_vars %} ### Environment Variables The following environment variables can be configured: -{% for var, (default_value, description) in docker_compose_env_vars.items() %} +{% for var, (default_value, description) in run_config_env_vars.items() %} - `{{ var }}`: {{ description }} (default: `{{ default_value }}`) {% endfor %} {% endif %} -{%- if default_models %} -### Models -The following models are configured by default: -{% for model in default_models %} -- `{{ model.model_id }}` -{% endfor %} -{% endif %} +## Setting up TGI server - -## Using Docker Compose - -You can use `docker compose` to start a TGI container and Llama Stack server container together. +Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker: ```bash -$ cd distributions/{{ name }}; docker compose up +export TGI_INFERENCE_PORT=8080 +export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct +export CUDA_VISIBLE_DEVICES=0 + +docker run --rm -it \ + -v $HOME/.cache/huggingface:/data \ + -p $TGI_INFERENCE_PORT:$TGI_INFERENCE_PORT \ + --gpus $CUDA_VISIBLE_DEVICES \ + ghcr.io/huggingface/text-generation-inference:2.3.1 \ + --dtype bfloat16 \ + --usage-stats off \ + --sharded false \ + --cuda-memory-fraction 0.7 \ + --model-id $INFERENCE_MODEL \ + --port $TGI_INFERENCE_PORT ``` -The script will first start up TGI server, then start up Llama Stack distribution server hooking up to the remote TGI provider for inference. You should be able to see the following outputs -- -```bash -[text-generation-inference] | 2024-10-15T18:56:33.810397Z INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama) -[text-generation-inference] | 2024-10-15T18:56:33.810448Z WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0 -[text-generation-inference] | 2024-10-15T18:56:33.864143Z INFO text_generation_router::server: router/src/server.rs:2353: Connected -INFO: Started server process [1] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://[::]:5001 (Press CTRL+C to quit) -``` - -To kill the server -```bash -docker compose down -``` - - -### Conda: TGI server + llama stack run - -If you wish to separately spin up a TGI server, and connect with Llama Stack, you may use the following commands. - -#### Start TGI server locally -- Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. +If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: ```bash -docker run --rm -it -v $HOME/.cache/huggingface:/data \ - -p 5009:5009 --gpus all \ - ghcr.io/huggingface/text-generation-inference:latest \ - --dtype bfloat16 --usage-stats on --sharded false \ - --model-id meta-llama/Llama-3.2-3B-Instruct --port 5009 +export TGI_SAFETY_PORT=8081 +export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B +export CUDA_VISIBLE_DEVICES=1 + +docker run --rm -it \ + -v $HOME/.cache/huggingface:/data \ + -p $TGI_SAFETY_PORT:$TGI_SAFETY_PORT \ + --gpus $CUDA_VISIBLE_DEVICES \ + ghcr.io/huggingface/text-generation-inference:2.3.1 \ + --dtype bfloat16 \ + --usage-stats off \ + --sharded false \ + --model-id $SAFETY_MODEL \ + --port $TGI_SAFETY_PORT ``` -#### Start Llama Stack server pointing to TGI server +## Running Llama Stack with TGI as the inference provider -**Via Conda** +Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. + +```bash +LLAMA_STACK_PORT=5001 +docker run \ + --network host \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run.yaml:/root/my-run.yaml \ + llamastack/distribution-{{ name }} \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT +``` + +If you are using Llama Stack Safety / Shield APIs, use: + +```bash +docker run \ + --network host \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run-with-safety.yaml:/root/my-run.yaml \ + llamastack/distribution-{{ name }} \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT \ + --env SAFETY_MODEL=$SAFETY_MODEL \ + --env TGI_SAFETY_URL=http://host.docker.internal:$TGI_SAFETY_PORT +``` + +### Via Conda + +Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. ```bash llama stack build --template {{ name }} --image-type conda -# -- start a TGI server endpoint -llama stack run ./gpu/run.yaml +llama stack run ./run.yaml + --port 5001 + --env INFERENCE_MODEL=$INFERENCE_MODEL + --env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT ``` -**Via Docker** +If you are using Llama Stack Safety / Shield APIs, use: + ```bash -docker run --network host -it -p 5001:5001 \ - -v ./run.yaml:/root/my-run.yaml --gpus=all \ - llamastack/distribution-{{ name }} \ - --yaml_config /root/my-run.yaml -``` - -We have provided a template `run.yaml` file in the `distributions/{{ name }}` directory. Make sure in your `run.yaml` file, you inference provider is pointing to the correct TGI server endpoint. E.g. -```yaml -inference: - - provider_id: tgi0 - provider_type: remote::tgi - config: - url: http://127.0.0.1:5009 -``` - - -### (Optional) Update Model Serving Configuration -To serve a new model with `tgi`, change the docker command flag `--model-id `. - -This can be done by edit the `command` args in `compose.yaml`. E.g. Replace "Llama-3.2-1B-Instruct" with the model you want to serve. - -```yaml -command: > - --dtype bfloat16 --usage-stats on --sharded false - --model-id meta-llama/Llama-3.2-1B-Instruct - --port 5009 --cuda-memory-fraction 0.7 -``` - -or by changing the docker run command's `--model-id` flag -```bash -docker run --rm -it -v $HOME/.cache/huggingface:/data \ - -p 5009:5009 --gpus all \ - ghcr.io/huggingface/text-generation-inference:latest \ - --dtype bfloat16 --usage-stats off --sharded false \ - --model-id meta-llama/Llama-3.2-3B-Instruct --port 5009 -``` - -In `run.yaml`, make sure you point the correct server endpoint to the TGI server endpoint serving your model. -```yaml -inference: - - provider_id: tgi0 - provider_type: remote::tgi - config: - url: http://127.0.0.1:5009 +llama stack run ./run-with-safety.yaml + --port 5001 + --env INFERENCE_MODEL=$INFERENCE_MODEL + --env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT + --env SAFETY_MODEL=$SAFETY_MODEL + --env TGI_SAFETY_URL=http://127.0.0.1:$TGI_SAFETY_PORT ``` diff --git a/llama_stack/templates/tgi/tgi.py b/llama_stack/templates/tgi/tgi.py index cee848fc8..79f2ad395 100644 --- a/llama_stack/templates/tgi/tgi.py +++ b/llama_stack/templates/tgi/tgi.py @@ -60,7 +60,7 @@ def get_distribution_template() -> DistributionTemplate: provider_id="tgi-safety", provider_type="remote::tgi", config=TGIImplConfig.sample_run_config( - url="${env.SAFETY_TGI_URL}", + url="${env.TGI_SAFETY_URL}", ), ), ], @@ -72,7 +72,7 @@ def get_distribution_template() -> DistributionTemplate: default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], ), }, - docker_compose_env_vars={ + run_config_env_vars={ "LLAMASTACK_PORT": ( "5001", "Port for the Llama Stack distribution server", @@ -82,11 +82,11 @@ def get_distribution_template() -> DistributionTemplate: "Inference model loaded into the TGI server", ), "TGI_URL": ( - "http://host.docker.internal:8080}/v1", + "http://127.0.0.1:8080}/v1", "URL of the TGI server with the main inference model", ), - "SAFETY_TGI_URL": ( - "http://host.docker.internal:8081/v1", + "TGI_SAFETY_URL": ( + "http://127.0.0.1:8081/v1", "URL of the TGI server with the safety model", ), "SAFETY_MODEL": (