diff --git a/distributions/fireworks/run.yaml b/distributions/fireworks/run.yaml index f289cee72..8d3316257 100644 --- a/distributions/fireworks/run.yaml +++ b/distributions/fireworks/run.yaml @@ -1,50 +1,91 @@ version: '2' -image_name: local +image_name: fireworks docker_image: null -conda_env: local +conda_env: null apis: -- shields - agents -- models -- memory -- memory_banks - inference +- memory - safety +- telemetry providers: inference: - - provider_id: fireworks0 + - provider_id: fireworks provider_type: remote::fireworks config: url: https://api.fireworks.ai/inference - # api_key: - safety: - safety: - - provider_id: meta0 - provider_type: inline::llama-guard - config: - model: Llama-Guard-3-1B - excluded_categories: [] - - provider_id: meta1 - provider_type: inline::prompt-guard - config: - model: Prompt-Guard-86M + api_key: ${env.FIREWORKS_API_KEY} memory: - - provider_id: meta0 - provider_type: inline::meta-reference + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard config: {} - # Uncomment to use weaviate memory provider - # - provider_id: weaviate0 - # provider_type: remote::weaviate - # config: {} agents: - - provider_id: meta0 + - provider_id: meta-reference provider_type: inline::meta-reference config: persistence_store: - namespace: null type: sqlite - db_path: ~/.llama/runtime/kvstore.db + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/agents_store.db telemetry: - - provider_id: meta0 + - provider_id: meta-reference provider_type: inline::meta-reference config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/registry.db +models: +- metadata: {} + model_id: fireworks/llama-v3p1-8b-instruct + provider_id: null + provider_model_id: null +- metadata: {} + model_id: fireworks/llama-v3p1-70b-instruct + provider_id: null + provider_model_id: null +- metadata: {} + model_id: fireworks/llama-v3p1-405b-instruct + provider_id: null + provider_model_id: null +- metadata: {} + model_id: fireworks/llama-v3p2-1b-instruct + provider_id: null + provider_model_id: null +- metadata: {} + model_id: fireworks/llama-v3p2-3b-instruct + provider_id: null + provider_model_id: null +- metadata: {} + model_id: fireworks/llama-v3p2-11b-vision-instruct + provider_id: null + provider_model_id: null +- metadata: {} + model_id: fireworks/llama-v3p2-90b-vision-instruct + provider_id: null + provider_model_id: null +- metadata: {} + model_id: fireworks/llama-guard-3-8b + provider_id: null + provider_model_id: null +- metadata: {} + model_id: fireworks/llama-guard-3-11b-vision + provider_id: null + provider_model_id: null +shields: +- params: null + shield_id: meta-llama/Llama-Guard-3-8B + provider_id: null + provider_shield_id: null +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/distributions/remote-vllm/run-with-safety.yaml b/distributions/remote-vllm/run-with-safety.yaml index 94db5fe5d..075cd793f 100644 --- a/distributions/remote-vllm/run-with-safety.yaml +++ b/distributions/remote-vllm/run-with-safety.yaml @@ -1,6 +1,6 @@ version: '2' image_name: remote-vllm -docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3 +docker_image: null conda_env: null apis: - agents diff --git a/distributions/remote-vllm/run.yaml 
b/distributions/remote-vllm/run.yaml index e99f41760..da45acee2 100644 --- a/distributions/remote-vllm/run.yaml +++ b/distributions/remote-vllm/run.yaml @@ -1,6 +1,6 @@ version: '2' image_name: remote-vllm -docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3 +docker_image: null conda_env: null apis: - agents diff --git a/distributions/together/run.yaml b/distributions/together/run.yaml index de7ae3c53..cc3c890f4 100644 --- a/distributions/together/run.yaml +++ b/distributions/together/run.yaml @@ -1,45 +1,87 @@ version: '2' -image_name: local +image_name: together docker_image: null -conda_env: local +conda_env: null apis: -- shields - agents -- models -- memory -- memory_banks - inference +- memory - safety +- telemetry providers: inference: - - provider_id: together0 + - provider_id: together provider_type: remote::together config: url: https://api.together.xyz/v1 - # api_key: - safety: - - provider_id: meta0 - provider_type: inline::llama-guard - config: - model: Llama-Guard-3-1B - excluded_categories: [] - - provider_id: meta1 - provider_type: inline::prompt-guard - config: - model: Prompt-Guard-86M + api_key: ${env.TOGETHER_API_KEY} memory: - - provider_id: meta0 - provider_type: remote::weaviate + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard config: {} agents: - - provider_id: meta0 + - provider_id: meta-reference provider_type: inline::meta-reference config: persistence_store: - namespace: null type: sqlite - db_path: ~/.llama/runtime/kvstore.db + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/agents_store.db telemetry: - - provider_id: meta0 + - provider_id: meta-reference provider_type: inline::meta-reference config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/registry.db +models: +- metadata: {} + model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo + provider_id: null + provider_model_id: null +- metadata: {} + model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo + provider_id: null + provider_model_id: null +- metadata: {} + model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + provider_id: null + provider_model_id: null +- metadata: {} + model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo + provider_id: null + provider_model_id: null +- metadata: {} + model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo + provider_id: null + provider_model_id: null +- metadata: {} + model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + provider_id: null + provider_model_id: null +- metadata: {} + model_id: meta-llama/Meta-Llama-Guard-3-8B + provider_id: null + provider_model_id: null +- metadata: {} + model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo + provider_id: null + provider_model_id: null +shields: +- params: null + shield_id: meta-llama/Llama-Guard-3-1B + provider_id: null + provider_shield_id: null +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md b/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md index ee46cd18d..03ee9e604 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/fireworks.md @@ -2,63 +2,67 @@ 
The `llamastack/distribution-fireworks` distribution consists of the following provider configurations. +| API | Provider(s) | +|-----|-------------| +| agents | `inline::meta-reference` | +| inference | `remote::fireworks` | +| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | +| safety | `inline::llama-guard` | +| telemetry | `inline::meta-reference` | -| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** | -|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- | -| **Provider(s)** | remote::fireworks | meta-reference | meta-reference | meta-reference | meta-reference | -### Step 0. Prerequisite -- Make sure you have access to a fireworks API Key. You can get one by visiting [fireworks.ai](https://fireworks.ai/) +### Environment Variables -### Step 1. Start the Distribution (Single Node CPU) +The following environment variables can be configured: -#### (Option 1) Start Distribution Via Docker -> [!NOTE] -> This assumes you have an hosted endpoint at Fireworks with API Key. +- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) +- `FIREWORKS_API_KEY`: Fireworks.AI API Key (default: ``) -``` -$ cd distributions/fireworks && docker compose up +### Models + +The following models are available by default: + +- `fireworks/llama-v3p1-8b-instruct` +- `fireworks/llama-v3p1-70b-instruct` +- `fireworks/llama-v3p1-405b-instruct` +- `fireworks/llama-v3p2-1b-instruct` +- `fireworks/llama-v3p2-3b-instruct` +- `fireworks/llama-v3p2-11b-vision-instruct` +- `fireworks/llama-v3p2-90b-vision-instruct` +- `fireworks/llama-guard-3-8b` +- `fireworks/llama-guard-3-11b-vision` + + +### Prerequisite: API Keys + +Make sure you have access to a Fireworks API Key. You can get one by visiting [fireworks.ai](https://fireworks.ai/). + + +## Running Llama Stack with Fireworks + +You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. + +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run.yaml:/root/my-run.yaml \ + llamastack/distribution-fireworks \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY ``` -Make sure in you `run.yaml` file, you inference provider is pointing to the correct Fireworks URL server endpoint. E.g. -``` -inference: - - provider_id: fireworks - provider_type: remote::fireworks - config: - url: https://api.fireworks.ai/inference - api_key: -``` - -#### (Option 2) Start Distribution Via Conda +### Via Conda ```bash llama stack build --template fireworks --image-type conda -# -- modify run.yaml to a valid Fireworks server endpoint -llama stack run ./run.yaml -``` - - -### (Optional) Model Serving - -Use `llama-stack-client models list` to check the available models served by Fireworks. 
-``` -$ llama-stack-client models list -+------------------------------+------------------------------+---------------+------------+ -| identifier | llama_model | provider_id | metadata | -+==============================+==============================+===============+============+ -| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-1B-Instruct | Llama3.2-1B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | fireworks0 | {} | -+------------------------------+------------------------------+---------------+------------+ +llama stack run ./run.yaml \ + --port 5001 \ + --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY ``` diff --git a/docs/source/getting_started/distributions/self_hosted_distro/ollama.md b/docs/source/getting_started/distributions/self_hosted_distro/ollama.md index 3db186f18..0acee3198 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/ollama.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/ollama.md @@ -11,90 +11,97 @@ The `llamastack/distribution-ollama` distribution consists of the following prov | telemetry | `inline::meta-reference` | -You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Models +You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration. -The following models are configured by default: -- `${env.INFERENCE_MODEL}` -- `${env.SAFETY_MODEL}` +## Setting up Ollama server -## Using Docker Compose +Please check the [Ollama Documentation](https://github.com/ollama/ollama) on how to install and run Ollama. After installing Ollama, you need to run `ollama serve` to start the server. -You can use `docker compose` to start a Ollama server and connect with Llama Stack server in a single command. +In order to load models, you can run: ```bash -$ cd distributions/ollama; docker compose up +export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" + +# ollama names this model differently, and we must use the ollama name when loading the model +export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16" +ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m ``` -You will see outputs similar to following --- +If you are using Llama Stack Safety / Shield APIs, you will also need to pull and run the safety model. 
+ ```bash -[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps" -[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps" -INFO: Started server process [1] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) -[llamastack] | Resolved 12 providers -[llamastack] | inner-inference => ollama0 -[llamastack] | models => __routing_table__ -[llamastack] | inference => __autorouted__ +export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B" + +# ollama names this model differently, and we must use the ollama name when loading the model +export OLLAMA_SAFETY_MODEL="llama-guard3:1b" +ollama run $OLLAMA_SAFETY_MODEL --keepalive 60m ``` -To kill the server +## Running Llama Stack + +Now you are ready to run Llama Stack with Ollama as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. + ```bash -docker compose down +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ + -v ./run.yaml:/root/my-run.yaml \ + --gpus=all \ + llamastack/distribution-ollama \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env OLLAMA_URL=http://host.docker.internal:11434 ``` -## Starting Ollama and Llama Stack separately +If you are using Llama Stack Safety / Shield APIs, use: -If you wish to separately spin up a Ollama server, and connect with Llama Stack, you should use the following commands. - -#### Start Ollama server -- Please check the [Ollama Documentation](https://github.com/ollama/ollama) for more details. - -**Via Docker** ```bash -docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ + -v ./run-with-safety.yaml:/root/my-run.yaml \ + --gpus=all \ + llamastack/distribution-ollama \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env SAFETY_MODEL=$SAFETY_MODEL \ + --env OLLAMA_URL=http://host.docker.internal:11434 ``` -**Via CLI** -```bash -ollama run -``` +### Via Conda -#### Start Llama Stack server pointing to Ollama server - -**Via Conda** +Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. ```bash llama stack build --template ollama --image-type conda -llama stack run run.yaml +llama stack run ./run.yaml \ + --port 5001 \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env OLLAMA_URL=http://127.0.0.1:11434 ``` -**Via Docker** -``` -docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./gpu/run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack/distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml -``` - -Make sure in your `run.yaml` file, your inference provider is pointing to the correct Ollama endpoint. E.g. -```yaml -inference: - - provider_id: ollama0 - provider_type: remote::ollama - config: - url: http://127.0.0.1:14343 -``` - -### (Optional) Update Model Serving Configuration - -#### Downloading model via Ollama - -You can use ollama for managing model downloads. 
+If you are using Llama Stack Safety / Shield APIs, use: ```bash -ollama pull llama3.1:8b-instruct-fp16 -ollama pull llama3.1:70b-instruct-fp16 +llama stack run ./run-with-safety.yaml \ + --port 5001 \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env SAFETY_MODEL=$SAFETY_MODEL \ + --env OLLAMA_URL=http://127.0.0.1:11434 ``` + +### (Optional) Update Model Serving Configuration + > [!NOTE] > Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models. diff --git a/docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md b/docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md index dd3684436..c9f8d6167 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/remote-vllm.md @@ -12,77 +12,106 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference. -### Models -The following models are configured by default: -- `${env.INFERENCE_MODEL}` -- `${env.SAFETY_MODEL}` -## Using Docker Compose -You can use `docker compose` to start a vLLM container and Llama Stack server container together. -```bash -$ cd distributions/remote-vllm; docker compose up -``` +## Setting up vLLM server -You will see outputs similar to following --- -``` - -``` - -To kill the server -```bash -docker compose down -``` - -## Starting vLLM and Llama Stack separately - -You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack. - -#### Start vLLM server. +Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) to get a vLLM endpoint. Here is a sample script to start a vLLM server locally via Docker: ```bash -docker run --runtime nvidia --gpus all \ +export INFERENCE_PORT=8000 +export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct +export CUDA_VISIBLE_DEVICES=0 + +docker run \ + --runtime nvidia \ + --gpus $CUDA_VISIBLE_DEVICES \ -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=" \ - -p 8000:8000 \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -p $INFERENCE_PORT:$INFERENCE_PORT \ --ipc=host \ vllm/vllm-openai:latest \ - --model meta-llama/Llama-3.2-3B-Instruct + --model $INFERENCE_MODEL \ + --port $INFERENCE_PORT ``` -Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details. +If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: - -#### Start Llama Stack server pointing to your vLLM server - - -We have provided a template `run.yaml` file in the `distributions/remote-vllm` directory. Please make sure to modify the `inference.provider_id` to point to your vLLM server endpoint. 
As an example, if your vLLM server is running on `http://127.0.0.1:8000`, your `run.yaml` file should look like the following: -```yaml -inference: - - provider_id: vllm0 - provider_type: remote::vllm - config: - url: http://127.0.0.1:8000 -``` - -**Via Conda** - -If you are using Conda, you can build and run the Llama Stack server with the following commands: ```bash -cd distributions/remote-vllm -llama stack build --template remote-vllm --image-type conda -llama stack run run.yaml +export SAFETY_PORT=8081 +export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B +export CUDA_VISIBLE_DEVICES=1 + +docker run \ + --runtime nvidia \ + --gpus $CUDA_VISIBLE_DEVICES \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -p $SAFETY_PORT:$SAFETY_PORT \ + --ipc=host \ + vllm/vllm-openai:latest \ + --model $SAFETY_MODEL \ + --port $SAFETY_PORT ``` -**Via Docker** +## Running Llama Stack + +Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. -You can use the Llama Stack Docker image to start the server with the following command: ```bash -docker run --network host -it -p 5000:5000 \ - -v ~/.llama:/root/.llama \ - -v ./gpu/run.yaml:/root/llamastack-run-remote-vllm.yaml \ - --gpus=all \ +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-remote-vllm \ - --yaml_config /root/llamastack-run-remote-vllm.yaml + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT \ +``` + +If you are using Llama Stack Safety / Shield APIs, use: + +```bash +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run-with-safety.yaml:/root/my-run.yaml \ + llamastack/distribution-remote-vllm \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT \ + --env SAFETY_MODEL=$SAFETY_MODEL \ + --env VLLM_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT +``` + + +### Via Conda + +Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. + +```bash +llama stack build --template remote-vllm --image-type conda +llama stack run ./run.yaml \ + --port 5001 \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT +``` + +If you are using Llama Stack Safety / Shield APIs, use: + +```bash +llama stack run ./run-with-safety.yaml \ + --port 5001 \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \ + --env SAFETY_MODEL=$SAFETY_MODEL \ + --env VLLM_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT ``` diff --git a/docs/source/getting_started/distributions/self_hosted_distro/tgi.md b/docs/source/getting_started/distributions/self_hosted_distro/tgi.md index fff8c1d08..7f84833f3 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/tgi.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/tgi.md @@ -29,13 +29,13 @@ The following environment variables can be configured: Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. 
Here is a sample script to start a TGI server locally via Docker: ```bash -export TGI_INFERENCE_PORT=8080 +export INFERENCE_PORT=8080 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct export CUDA_VISIBLE_DEVICES=0 docker run --rm -it \ -v $HOME/.cache/huggingface:/data \ - -p $TGI_INFERENCE_PORT:$TGI_INFERENCE_PORT \ + -p $INFERENCE_PORT:$INFERENCE_PORT \ --gpus $CUDA_VISIBLE_DEVICES \ ghcr.io/huggingface/text-generation-inference:2.3.1 \ --dtype bfloat16 \ @@ -43,29 +43,29 @@ docker run --rm -it \ --sharded false \ --cuda-memory-fraction 0.7 \ --model-id $INFERENCE_MODEL \ - --port $TGI_INFERENCE_PORT + --port $INFERENCE_PORT ``` If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: ```bash -export TGI_SAFETY_PORT=8081 +export SAFETY_PORT=8081 export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B export CUDA_VISIBLE_DEVICES=1 docker run --rm -it \ -v $HOME/.cache/huggingface:/data \ - -p $TGI_SAFETY_PORT:$TGI_SAFETY_PORT \ + -p $SAFETY_PORT:$SAFETY_PORT \ --gpus $CUDA_VISIBLE_DEVICES \ ghcr.io/huggingface/text-generation-inference:2.3.1 \ --dtype bfloat16 \ --usage-stats off \ --sharded false \ --model-id $SAFETY_MODEL \ - --port $TGI_SAFETY_PORT + --port $SAFETY_PORT ``` -## Running Llama Stack with TGI as the inference provider +## Running Llama Stack Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. @@ -76,7 +76,6 @@ This method allows you to get started quickly without having to build the distri ```bash LLAMA_STACK_PORT=5001 docker run \ - --network host \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -v ./run.yaml:/root/my-run.yaml \ @@ -84,14 +83,13 @@ docker run \ /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT + --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT ``` If you are using Llama Stack Safety / Shield APIs, use: ```bash docker run \ - --network host \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -v ./run-with-safety.yaml:/root/my-run.yaml \ @@ -99,9 +97,9 @@ docker run \ /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT \ + --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \ --env SAFETY_MODEL=$SAFETY_MODEL \ - --env TGI_SAFETY_URL=http://host.docker.internal:$TGI_SAFETY_PORT + --env TGI_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT ``` ### Via Conda @@ -113,7 +111,7 @@ llama stack build --template tgi --image-type conda llama stack run ./run.yaml --port 5001 --env INFERENCE_MODEL=$INFERENCE_MODEL - --env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT + --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT ``` If you are using Llama Stack Safety / Shield APIs, use: @@ -122,7 +120,7 @@ If you are using Llama Stack Safety / Shield APIs, use: llama stack run ./run-with-safety.yaml --port 5001 --env INFERENCE_MODEL=$INFERENCE_MODEL - --env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT + --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT --env SAFETY_MODEL=$SAFETY_MODEL - --env TGI_SAFETY_URL=http://127.0.0.1:$TGI_SAFETY_PORT + --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT ``` diff --git a/docs/source/getting_started/distributions/self_hosted_distro/together.md 
b/docs/source/getting_started/distributions/self_hosted_distro/together.md index b9ea9f6e6..17f109e65 100644 --- a/docs/source/getting_started/distributions/self_hosted_distro/together.md +++ b/docs/source/getting_started/distributions/self_hosted_distro/together.md @@ -1,62 +1,67 @@ -# Together Distribution - -### Connect to a Llama Stack Together Endpoint -- You may connect to a hosted endpoint `https://llama-stack.together.ai`, serving a Llama Stack distribution +# Fireworks Distribution The `llamastack/distribution-together` distribution consists of the following provider configurations. - -| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** | -|----------------- |--------------- |---------------- |-------------------------------------------------- |---------------- |---------------- | -| **Provider(s)** | remote::together | meta-reference | meta-reference, remote::weaviate | meta-reference | meta-reference | +| API | Provider(s) | +|-----|-------------| +| agents | `inline::meta-reference` | +| inference | `remote::together` | +| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | +| safety | `inline::llama-guard` | +| telemetry | `inline::meta-reference` | -### Docker: Start the Distribution (Single Node CPU) +### Environment Variables -> [!NOTE] -> This assumes you have an hosted endpoint at Together with API Key. +The following environment variables can be configured: -``` -$ cd distributions/together && docker compose up +- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) +- `TOGETHER_API_KEY`: Together.AI API Key (default: ``) + +### Models + +The following models are available by default: + +- `meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo` +- `meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo` +- `meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo` +- `meta-llama/Llama-3.2-3B-Instruct-Turbo` +- `meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo` +- `meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo` +- `meta-llama/Meta-Llama-Guard-3-8B` +- `meta-llama/Llama-Guard-3-11B-Vision-Turbo` + + +### Prerequisite: API Keys + +Make sure you have access to a Together API Key. You can get one by visiting [together.xyz](https://together.xyz/). + + +## Running Llama Stack with Together + +You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. + +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run.yaml:/root/my-run.yaml \ + llamastack/distribution-together \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env TOGETHER_API_KEY=$TOGETHER_API_KEY ``` -Make sure in your `run.yaml` file, your inference provider is pointing to the correct Together URL server endpoint. E.g. -``` -inference: - - provider_id: together - provider_type: remote::together - config: - url: https://api.together.xyz/v1 - api_key: -``` - -### Conda llama stack run (Single Node CPU) +### Via Conda ```bash llama stack build --template together --image-type conda -# -- modify run.yaml to a valid Together server endpoint -llama stack run ./run.yaml -``` - -### (Optional) Update Model Serving Configuration - -Use `llama-stack-client models list` to check the available models served by together. 
- -``` -$ llama-stack-client models list -+------------------------------+------------------------------+---------------+------------+ -| identifier | llama_model | provider_id | metadata | -+==============================+==============================+===============+============+ -| Llama3.1-8B-Instruct | Llama3.1-8B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-70B-Instruct | Llama3.1-70B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-405B-Instruct | Llama3.1-405B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-3B-Instruct | Llama3.2-3B-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-11B-Vision-Instruct | Llama3.2-11B-Vision-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.2-90B-Vision-Instruct | Llama3.2-90B-Vision-Instruct | together0 | {} | -+------------------------------+------------------------------+---------------+------------+ +llama stack run ./run.yaml \ + --port 5001 \ + --env TOGETHER_API_KEY=$TOGETHER_API_KEY ``` diff --git a/llama_stack/providers/remote/inference/fireworks/config.py b/llama_stack/providers/remote/inference/fireworks/config.py index 275ce99e7..062c1e1ea 100644 --- a/llama_stack/providers/remote/inference/fireworks/config.py +++ b/llama_stack/providers/remote/inference/fireworks/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from typing import Optional +from typing import Any, Dict, Optional from llama_models.schema_utils import json_schema_type from pydantic import BaseModel, Field @@ -20,3 +20,10 @@ class FireworksImplConfig(BaseModel): default=None, description="The Fireworks.ai API Key", ) + + @classmethod + def sample_run_config(cls) -> Dict[str, Any]: + return { + "url": "https://api.fireworks.ai/inference", + "api_key": "${env.FIREWORKS_API_KEY}", + } diff --git a/llama_stack/providers/remote/inference/fireworks/fireworks.py b/llama_stack/providers/remote/inference/fireworks/fireworks.py index 42075eff7..3ff50d378 100644 --- a/llama_stack/providers/remote/inference/fireworks/fireworks.py +++ b/llama_stack/providers/remote/inference/fireworks/fireworks.py @@ -35,7 +35,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .config import FireworksImplConfig -model_aliases = [ +MODEL_ALIASES = [ build_model_alias( "fireworks/llama-v3p1-8b-instruct", CoreModelId.llama3_1_8b_instruct.value, @@ -79,7 +79,7 @@ class FireworksInferenceAdapter( ModelRegistryHelper, Inference, NeedsRequestProviderData ): def __init__(self, config: FireworksImplConfig) -> None: - ModelRegistryHelper.__init__(self, model_aliases) + ModelRegistryHelper.__init__(self, MODEL_ALIASES) self.config = config self.formatter = ChatFormat(Tokenizer.get_instance()) diff --git a/llama_stack/providers/remote/inference/together/config.py b/llama_stack/providers/remote/inference/together/config.py index e928a771d..11944c0c7 100644 --- a/llama_stack/providers/remote/inference/together/config.py +++ b/llama_stack/providers/remote/inference/together/config.py @@ -4,7 +4,7 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from typing import Optional +from typing import Any, Dict, Optional from llama_models.schema_utils import json_schema_type from pydantic import BaseModel, Field @@ -20,3 +20,10 @@ class TogetherImplConfig(BaseModel): default=None, description="The Together AI API Key", ) + + @classmethod + def sample_run_config(cls) -> Dict[str, Any]: + return { + "url": "https://api.together.xyz/v1", + "api_key": "${env.TOGETHER_API_KEY}", + } diff --git a/llama_stack/providers/remote/inference/together/together.py b/llama_stack/providers/remote/inference/together/together.py index aae34bb87..e7c96ce98 100644 --- a/llama_stack/providers/remote/inference/together/together.py +++ b/llama_stack/providers/remote/inference/together/together.py @@ -38,7 +38,7 @@ from llama_stack.providers.utils.inference.prompt_adapter import ( from .config import TogetherImplConfig -model_aliases = [ +MODEL_ALIASES = [ build_model_alias( "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", CoreModelId.llama3_1_8b_instruct.value, @@ -78,7 +78,7 @@ class TogetherInferenceAdapter( ModelRegistryHelper, Inference, NeedsRequestProviderData ): def __init__(self, config: TogetherImplConfig) -> None: - ModelRegistryHelper.__init__(self, model_aliases) + ModelRegistryHelper.__init__(self, MODEL_ALIASES) self.config = config self.formatter = ChatFormat(Tokenizer.get_instance()) diff --git a/llama_stack/templates/fireworks/__init__.py b/llama_stack/templates/fireworks/__init__.py new file mode 100644 index 000000000..1d85c66db --- /dev/null +++ b/llama_stack/templates/fireworks/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .fireworks import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/fireworks/build.yaml b/llama_stack/templates/fireworks/build.yaml index ffd67738d..c16e3f5d6 100644 --- a/llama_stack/templates/fireworks/build.yaml +++ b/llama_stack/templates/fireworks/build.yaml @@ -1,11 +1,19 @@ +version: '2' name: fireworks distribution_spec: - description: Use Fireworks.ai for running LLM inference + description: Use Fireworks.AI for running LLM inference + docker_image: null providers: - inference: remote::fireworks + inference: + - remote::fireworks memory: - inline::faiss - - remote::weaviate - safety: inline::llama-guard - agents: inline::meta-reference - telemetry: inline::meta-reference + - remote::chromadb + - remote::pgvector + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference +image_type: conda diff --git a/llama_stack/templates/fireworks/doc_template.md b/llama_stack/templates/fireworks/doc_template.md new file mode 100644 index 000000000..bd25edfc1 --- /dev/null +++ b/llama_stack/templates/fireworks/doc_template.md @@ -0,0 +1,60 @@ +# Fireworks Distribution + +The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. + +{{ providers_table }} + +{% if run_config_env_vars %} +### Environment Variables + +The following environment variables can be configured: + +{% for var, (default_value, description) in run_config_env_vars.items() %} +- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) +{% endfor %} +{% endif %} + +{% if default_models %} +### Models + +The following models are available by default: + +{% for model in default_models %} +- `{{ model.model_id }}` +{% endfor %} +{% endif %} + + +### Prerequisite: API Keys + +Make sure you have access to a Fireworks API Key. You can get one by visiting [fireworks.ai](https://fireworks.ai/). + + +## Running Llama Stack with Fireworks + +You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. + +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run.yaml:/root/my-run.yaml \ + llamastack/distribution-{{ name }} \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY +``` + +### Via Conda + +```bash +llama stack build --template fireworks --image-type conda +llama stack run ./run.yaml \ + --port 5001 \ + --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY +``` diff --git a/llama_stack/templates/fireworks/fireworks.py b/llama_stack/templates/fireworks/fireworks.py new file mode 100644 index 000000000..c4d2fdac8 --- /dev/null +++ b/llama_stack/templates/fireworks/fireworks.py @@ -0,0 +1,60 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from pathlib import Path + +from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.remote.inference.fireworks import FireworksImplConfig +from llama_stack.providers.remote.inference.fireworks.fireworks import MODEL_ALIASES + +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["remote::fireworks"], + "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + } + + inference_provider = Provider( + provider_id="fireworks", + provider_type="remote::fireworks", + config=FireworksImplConfig.sample_run_config(), + ) + + default_models = [ModelInput(model_id=m.provider_model_id) for m in MODEL_ALIASES] + + return DistributionTemplate( + name="fireworks", + distro_type="self_hosted", + description="Use Fireworks.AI for running LLM inference", + docker_image=None, + template_path=Path(__file__).parent / "doc_template.md", + providers=providers, + default_models=default_models, + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + }, + default_models=default_models, + default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], + ), + }, + run_config_env_vars={ + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "FIREWORKS_API_KEY": ( + "", + "Fireworks.AI API Key", + ), + }, + ) diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md index 2121a4fd6..11a15c9e9 100644 --- a/llama_stack/templates/ollama/doc_template.md +++ b/llama_stack/templates/ollama/doc_template.md @@ -6,103 +6,106 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration. -{%- if docker_compose_env_vars %} +{%- if run_config_env_vars %} ### Environment Variables The following environment variables can be configured: -{% for var, (default_value, description) in docker_compose_env_vars.items() %} +{% for var, (default_value, description) in run_config_env_vars.items() %} - `{{ var }}`: {{ description }} (default: `{{ default_value }}`) {% endfor %} {% endif %} -{%- if default_models %} -### Models -The following models are configured by default: -{% for model in default_models %} -- `{{ model.model_id }}` -{% endfor %} -{% endif %} +## Setting up Ollama server -## Using Docker Compose +Please check the [Ollama Documentation](https://github.com/ollama/ollama) on how to install and run Ollama. After installing Ollama, you need to run `ollama serve` to start the server. -You can use `docker compose` to start a Ollama server and connect with Llama Stack server in a single command. 
+In order to load models, you can run: ```bash -$ cd distributions/{{ name }}; docker compose up +export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" + +# ollama names this model differently, and we must use the ollama name when loading the model +export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16" +ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m ``` -You will see outputs similar to following --- +If you are using Llama Stack Safety / Shield APIs, you will also need to pull and run the safety model. + ```bash -[ollama] | [GIN] 2024/10/18 - 21:19:41 | 200 | 226.841µs | ::1 | GET "/api/ps" -[ollama] | [GIN] 2024/10/18 - 21:19:42 | 200 | 60.908µs | ::1 | GET "/api/ps" -INFO: Started server process [1] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) -[llamastack] | Resolved 12 providers -[llamastack] | inner-inference => ollama0 -[llamastack] | models => __routing_table__ -[llamastack] | inference => __autorouted__ +export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B" + +# ollama names this model differently, and we must use the ollama name when loading the model +export OLLAMA_SAFETY_MODEL="llama-guard3:1b" +ollama run $OLLAMA_SAFETY_MODEL --keepalive 60m ``` -To kill the server +## Running Llama Stack + +Now you are ready to run Llama Stack with Ollama as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. + ```bash -docker compose down +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ + -v ./run.yaml:/root/my-run.yaml \ + --gpus=all \ + llamastack/distribution-{{ name }} \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env OLLAMA_URL=http://host.docker.internal:11434 ``` -## Starting Ollama and Llama Stack separately +If you are using Llama Stack Safety / Shield APIs, use: -If you wish to separately spin up a Ollama server, and connect with Llama Stack, you should use the following commands. - -#### Start Ollama server -- Please check the [Ollama Documentation](https://github.com/ollama/ollama) for more details. - -**Via Docker** ```bash -docker run -d -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ~/.llama:/root/.llama \ + -v ./run-with-safety.yaml:/root/my-run.yaml \ + --gpus=all \ + llamastack/distribution-{{ name }} \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env SAFETY_MODEL=$SAFETY_MODEL \ + --env OLLAMA_URL=http://host.docker.internal:11434 ``` -**Via CLI** -```bash -ollama run -``` +### Via Conda -#### Start Llama Stack server pointing to Ollama server - -**Via Conda** +Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. 
```bash llama stack build --template ollama --image-type conda -llama stack run run.yaml +llama stack run ./run.yaml \ + --port 5001 \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env OLLAMA_URL=http://127.0.0.1:11434 ``` -**Via Docker** -``` -docker run --network host -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./gpu/run.yaml:/root/llamastack-run-ollama.yaml --gpus=all llamastack/distribution-ollama --yaml_config /root/llamastack-run-ollama.yaml -``` - -Make sure in your `run.yaml` file, your inference provider is pointing to the correct Ollama endpoint. E.g. -```yaml -inference: - - provider_id: ollama0 - provider_type: remote::ollama - config: - url: http://127.0.0.1:14343 -``` - -### (Optional) Update Model Serving Configuration - -#### Downloading model via Ollama - -You can use ollama for managing model downloads. +If you are using Llama Stack Safety / Shield APIs, use: ```bash -ollama pull llama3.1:8b-instruct-fp16 -ollama pull llama3.1:70b-instruct-fp16 +llama stack run ./run-with-safety.yaml \ + --port 5001 \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env SAFETY_MODEL=$SAFETY_MODEL \ + --env OLLAMA_URL=http://127.0.0.1:11434 ``` + +### (Optional) Update Model Serving Configuration + > [!NOTE] > Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models. diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py index d40b02a2c..6e0056a77 100644 --- a/llama_stack/templates/ollama/ollama.py +++ b/llama_stack/templates/ollama/ollama.py @@ -68,17 +68,17 @@ def get_distribution_template() -> DistributionTemplate: "5001", "Port for the Llama Stack distribution server", ), + "OLLAMA_URL": ( + "http://127.0.0.1:11434", + "URL of the Ollama server", + ), "INFERENCE_MODEL": ( "meta-llama/Llama-3.2-3B-Instruct", - "Inference model loaded into the TGI server", - ), - "OLLAMA_URL": ( - "http://host.docker.internal:11434", - "URL of the Ollama server", + "Inference model loaded into the Ollama server", ), "SAFETY_MODEL": ( "meta-llama/Llama-Guard-3-1B", - "Name of the safety (Llama-Guard) model to use", + "Safety model loaded into the Ollama server", ), }, ) diff --git a/llama_stack/templates/remote-vllm/build.yaml b/llama_stack/templates/remote-vllm/build.yaml index 1efa5dc7a..9f4597cb0 100644 --- a/llama_stack/templates/remote-vllm/build.yaml +++ b/llama_stack/templates/remote-vllm/build.yaml @@ -2,7 +2,7 @@ version: '2' name: remote-vllm distribution_spec: description: Use (an external) vLLM server for running LLM inference - docker_image: llamastack/distribution-remote-vllm:test-0.0.52rc3 + docker_image: null providers: inference: - remote::vllm diff --git a/llama_stack/templates/remote-vllm/doc_template.md b/llama_stack/templates/remote-vllm/doc_template.md index 1045f6d15..c6ed53246 100644 --- a/llama_stack/templates/remote-vllm/doc_template.md +++ b/llama_stack/templates/remote-vllm/doc_template.md @@ -6,90 +6,114 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following You can use this distribution if you have GPUs and want to run an independent vLLM server container for running inference. 
-{%- if docker_compose_env_vars %} +{% if run_config_env_vars %} ### Environment Variables The following environment variables can be configured: -{% for var, (default_value, description) in docker_compose_env_vars.items() %} +{% for var, (default_value, description) in run_config_env_vars.items() %} - `{{ var }}`: {{ description }} (default: `{{ default_value }}`) {% endfor %} {% endif %} -{% if default_models %} -### Models -The following models are configured by default: -{% for model in default_models %} -- `{{ model.model_id }}` -{% endfor %} -{% endif %} +## Setting up vLLM server -## Using Docker Compose - -You can use `docker compose` to start a vLLM container and Llama Stack server container together. -```bash -$ cd distributions/{{ name }}; docker compose up -``` - -You will see outputs similar to following --- -``` - -``` - -To kill the server -```bash -docker compose down -``` - -## Starting vLLM and Llama Stack separately - -You can also decide to start a vLLM server and connect with Llama Stack manually. There are two ways to start a vLLM server and connect with Llama Stack. - -#### Start vLLM server. +Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) to get a vLLM endpoint. Here is a sample script to start a vLLM server locally via Docker: ```bash -docker run --runtime nvidia --gpus all \ +export INFERENCE_PORT=8000 +export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct +export CUDA_VISIBLE_DEVICES=0 + +docker run \ + --runtime nvidia \ + --gpus $CUDA_VISIBLE_DEVICES \ -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=" \ - -p 8000:8000 \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -p $INFERENCE_PORT:$INFERENCE_PORT \ --ipc=host \ vllm/vllm-openai:latest \ - --model meta-llama/Llama-3.2-3B-Instruct + --model $INFERENCE_MODEL \ + --port $INFERENCE_PORT ``` -Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) for more details. +If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: + +```bash +export SAFETY_PORT=8081 +export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B +export CUDA_VISIBLE_DEVICES=1 + +docker run \ + --runtime nvidia \ + --gpus $CUDA_VISIBLE_DEVICES \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ + -p $SAFETY_PORT:$SAFETY_PORT \ + --ipc=host \ + vllm/vllm-openai:latest \ + --model $SAFETY_MODEL \ + --port $SAFETY_PORT +``` + +## Running Llama Stack + +Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. 
+ +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run.yaml:/root/my-run.yaml \ + llamastack/distribution-{{ name }} \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT \ +``` + +If you are using Llama Stack Safety / Shield APIs, use: + +```bash +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run-with-safety.yaml:/root/my-run.yaml \ + llamastack/distribution-{{ name }} \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT \ + --env SAFETY_MODEL=$SAFETY_MODEL \ + --env VLLM_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT +``` + + +### Via Conda + +Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. - -#### Start Llama Stack server pointing to your vLLM server - - -We have provided a template `run.yaml` file in the `distributions/remote-vllm` directory. Please make sure to modify the `inference.provider_id` to point to your vLLM server endpoint. As an example, if your vLLM server is running on `http://127.0.0.1:8000`, your `run.yaml` file should look like the following: -```yaml -inference: - - provider_id: vllm0 - provider_type: remote::vllm - config: - url: http://127.0.0.1:8000 -``` - -**Via Conda** - -If you are using Conda, you can build and run the Llama Stack server with the following commands: ```bash -cd distributions/remote-vllm llama stack build --template remote-vllm --image-type conda -llama stack run run.yaml +llama stack run ./run.yaml \ + --port 5001 \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT ``` -**Via Docker** +If you are using Llama Stack Safety / Shield APIs, use: -You can use the Llama Stack Docker image to start the server with the following command: ```bash -docker run --network host -it -p 5000:5000 \ - -v ~/.llama:/root/.llama \ - -v ./gpu/run.yaml:/root/llamastack-run-remote-vllm.yaml \ - --gpus=all \ - llamastack/distribution-remote-vllm \ - --yaml_config /root/llamastack-run-remote-vllm.yaml +llama stack run ./run-with-safety.yaml \ + --port 5001 \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env VLLM_URL=http://127.0.0.1:$INFERENCE_PORT \ + --env SAFETY_MODEL=$SAFETY_MODEL \ + --env VLLM_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT ``` diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py index 00073a856..ad3c1d8e2 100644 --- a/llama_stack/templates/remote-vllm/vllm.py +++ b/llama_stack/templates/remote-vllm/vllm.py @@ -41,7 +41,6 @@ def get_distribution_template() -> DistributionTemplate: name="remote-vllm", distro_type="self_hosted", description="Use (an external) vLLM server for running LLM inference", - docker_image="llamastack/distribution-remote-vllm:test-0.0.52rc3", template_path=Path(__file__).parent / "doc_template.md", providers=providers, default_models=[inference_model, safety_model], diff --git a/llama_stack/templates/tgi/doc_template.md b/llama_stack/templates/tgi/doc_template.md index 9b22b3b37..d4dee7fb7 100644 --- a/llama_stack/templates/tgi/doc_template.md +++ b/llama_stack/templates/tgi/doc_template.md @@ -22,13 +22,13 @@ The following environment variables can be configured: Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI 
endpoint. Here is a sample script to start a TGI server locally via Docker: ```bash -export TGI_INFERENCE_PORT=8080 +export INFERENCE_PORT=8080 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct export CUDA_VISIBLE_DEVICES=0 docker run --rm -it \ -v $HOME/.cache/huggingface:/data \ - -p $TGI_INFERENCE_PORT:$TGI_INFERENCE_PORT \ + -p $INFERENCE_PORT:$INFERENCE_PORT \ --gpus $CUDA_VISIBLE_DEVICES \ ghcr.io/huggingface/text-generation-inference:2.3.1 \ --dtype bfloat16 \ @@ -36,29 +36,29 @@ docker run --rm -it \ --sharded false \ --cuda-memory-fraction 0.7 \ --model-id $INFERENCE_MODEL \ - --port $TGI_INFERENCE_PORT + --port $INFERENCE_PORT ``` If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: ```bash -export TGI_SAFETY_PORT=8081 +export SAFETY_PORT=8081 export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B export CUDA_VISIBLE_DEVICES=1 docker run --rm -it \ -v $HOME/.cache/huggingface:/data \ - -p $TGI_SAFETY_PORT:$TGI_SAFETY_PORT \ + -p $SAFETY_PORT:$SAFETY_PORT \ --gpus $CUDA_VISIBLE_DEVICES \ ghcr.io/huggingface/text-generation-inference:2.3.1 \ --dtype bfloat16 \ --usage-stats off \ --sharded false \ --model-id $SAFETY_MODEL \ - --port $TGI_SAFETY_PORT + --port $SAFETY_PORT ``` -## Running Llama Stack with TGI as the inference provider +## Running Llama Stack Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. @@ -69,7 +69,6 @@ This method allows you to get started quickly without having to build the distri ```bash LLAMA_STACK_PORT=5001 docker run \ - --network host \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -v ./run.yaml:/root/my-run.yaml \ @@ -77,14 +76,13 @@ docker run \ /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT + --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT ``` If you are using Llama Stack Safety / Shield APIs, use: ```bash docker run \ - --network host \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -v ./run-with-safety.yaml:/root/my-run.yaml \ @@ -92,9 +90,9 @@ docker run \ /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env TGI_URL=http://host.docker.internal:$TGI_INFERENCE_PORT \ + --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \ --env SAFETY_MODEL=$SAFETY_MODEL \ - --env TGI_SAFETY_URL=http://host.docker.internal:$TGI_SAFETY_PORT + --env TGI_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT ``` ### Via Conda @@ -106,7 +104,7 @@ llama stack build --template {{ name }} --image-type conda llama stack run ./run.yaml --port 5001 --env INFERENCE_MODEL=$INFERENCE_MODEL - --env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT + --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT ``` If you are using Llama Stack Safety / Shield APIs, use: @@ -115,7 +113,7 @@ If you are using Llama Stack Safety / Shield APIs, use: llama stack run ./run-with-safety.yaml --port 5001 --env INFERENCE_MODEL=$INFERENCE_MODEL - --env TGI_URL=http://127.0.0.1:$TGI_INFERENCE_PORT + --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT --env SAFETY_MODEL=$SAFETY_MODEL - --env TGI_SAFETY_URL=http://127.0.0.1:$TGI_SAFETY_PORT + --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT ``` diff --git a/llama_stack/templates/together/__init__.py b/llama_stack/templates/together/__init__.py new file mode 100644 index 
000000000..757995b6b --- /dev/null +++ b/llama_stack/templates/together/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .together import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/together/build.yaml b/llama_stack/templates/together/build.yaml index 5c149272d..a4402ba93 100644 --- a/llama_stack/templates/together/build.yaml +++ b/llama_stack/templates/together/build.yaml @@ -1,11 +1,19 @@ +version: '2' name: together distribution_spec: - description: Use Together.ai for running LLM inference + description: Use Together.AI for running LLM inference + docker_image: null providers: - inference: remote::together + inference: + - remote::together memory: - inline::faiss - - remote::weaviate - safety: inline::llama-guard - agents: inline::meta-reference - telemetry: inline::meta-reference + - remote::chromadb + - remote::pgvector + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference +image_type: conda diff --git a/llama_stack/templates/together/doc_template.md b/llama_stack/templates/together/doc_template.md new file mode 100644 index 000000000..667a68713 --- /dev/null +++ b/llama_stack/templates/together/doc_template.md @@ -0,0 +1,60 @@ +# Fireworks Distribution + +The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. + +{{ providers_table }} + +{% if run_config_env_vars %} +### Environment Variables + +The following environment variables can be configured: + +{% for var, (default_value, description) in run_config_env_vars.items() %} +- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) +{% endfor %} +{% endif %} + +{% if default_models %} +### Models + +The following models are available by default: + +{% for model in default_models %} +- `{{ model.model_id }}` +{% endfor %} +{% endif %} + + +### Prerequisite: API Keys + +Make sure you have access to a Together API Key. You can get one by visiting [together.xyz](https://together.xyz/). + + +## Running Llama Stack with Together + +You can do this via Conda (build code) or Docker which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. + +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -v ./run.yaml:/root/my-run.yaml \ + llamastack/distribution-{{ name }} \ + /root/my-run.yaml \ + --port $LLAMA_STACK_PORT \ + --env TOGETHER_API_KEY=$TOGETHER_API_KEY +``` + +### Via Conda + +```bash +llama stack build --template together --image-type conda +llama stack run ./run.yaml \ + --port 5001 \ + --env TOGETHER_API_KEY=$TOGETHER_API_KEY +``` diff --git a/llama_stack/templates/together/together.py b/llama_stack/templates/together/together.py new file mode 100644 index 000000000..250ef02c3 --- /dev/null +++ b/llama_stack/templates/together/together.py @@ -0,0 +1,60 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from pathlib import Path + +from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.remote.inference.together import TogetherImplConfig +from llama_stack.providers.remote.inference.together.together import MODEL_ALIASES + +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["remote::together"], + "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + } + + inference_provider = Provider( + provider_id="together", + provider_type="remote::together", + config=TogetherImplConfig.sample_run_config(), + ) + + default_models = [ModelInput(model_id=m.provider_model_id) for m in MODEL_ALIASES] + + return DistributionTemplate( + name="together", + distro_type="self_hosted", + description="Use Together.AI for running LLM inference", + docker_image=None, + template_path=Path(__file__).parent / "doc_template.md", + providers=providers, + default_models=default_models, + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + }, + default_models=default_models, + default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-1B")], + ), + }, + run_config_env_vars={ + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "TOGETHER_API_KEY": ( + "", + "Together.AI API Key", + ), + }, + )
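
---

A note on the `${env.VAR}` / `${env.VAR:default}` placeholders introduced throughout the run.yaml files and the `sample_run_config()` helpers above: the sketch below is a minimal, illustrative resolver for that syntax, written to clarify the substitution semantics implied by this diff. It is **not** the code path Llama Stack itself uses; the `run.yaml` path, the PyYAML dependency, and the colon-separated-default behaviour are assumptions drawn from the templates in this change.

```python
# Illustrative sketch only (not the actual Llama Stack resolver): expand the
# ${env.VAR} / ${env.VAR:default} placeholders used in the run.yaml files
# above before handing the parsed config to a server process.
import os
import re
from pathlib import Path

import yaml  # assumes PyYAML is available

_PLACEHOLDER = re.compile(r"\$\{env\.(?P<name>[A-Za-z0-9_]+)(?::(?P<default>[^}]*))?\}")


def resolve_env_placeholders(text: str) -> str:
    """Replace ${env.NAME} / ${env.NAME:default} with values from the environment."""

    def _substitute(match: re.Match) -> str:
        name = match.group("name")
        default = match.group("default")  # None when no ":default" part is present
        value = os.environ.get(name, default)
        if value is None:
            raise ValueError(f"environment variable {name} is not set and has no default")
        return value

    return _PLACEHOLDER.sub(_substitute, text)


if __name__ == "__main__":
    # Hypothetical local copy of e.g. distributions/fireworks/run.yaml
    raw = Path("run.yaml").read_text()
    config = yaml.safe_load(resolve_env_placeholders(raw))
    # With FIREWORKS_API_KEY exported and SQLITE_STORE_DIR left unset, the
    # template defaults (e.g. ~/.llama/distributions/fireworks) are used.
    print(config["providers"]["inference"][0]["config"]["api_key"])
```

As a usage check under those assumptions, running `FIREWORKS_API_KEY=fw-... python resolve.py` against the fireworks run.yaml would print the key, while the `db_path` entries would fall back to their `~/.llama/distributions/<name>` defaults unless `SQLITE_STORE_DIR` is exported — mirroring how the `--env` flags in the `docker run` / `llama stack run` commands documented in this diff are expected to flow into the config.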