diff --git a/distributions/bedrock/run.yaml b/distributions/bedrock/run.yaml deleted file mode 100644 index 2f7cb36ef..000000000 --- a/distributions/bedrock/run.yaml +++ /dev/null @@ -1,45 +0,0 @@ -version: '2' -image_name: local -name: bedrock -docker_image: null -conda_env: local -apis: -- shields -- agents -- models -- memory -- memory_banks -- inference -- safety -providers: - inference: - - provider_id: bedrock0 - provider_type: remote::bedrock - config: - aws_access_key_id: - aws_secret_access_key: - aws_session_token: - region_name: - memory: - - provider_id: meta0 - provider_type: inline::meta-reference - config: {} - safety: - - provider_id: bedrock0 - provider_type: remote::bedrock - config: - aws_access_key_id: - aws_secret_access_key: - aws_session_token: - region_name: - agents: - - provider_id: meta0 - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - db_path: ~/.llama/runtime/kvstore.db - telemetry: - - provider_id: meta0 - provider_type: inline::meta-reference - config: {} diff --git a/distributions/bedrock/run.yaml b/distributions/bedrock/run.yaml new file mode 120000 index 000000000..f38abfc4e --- /dev/null +++ b/distributions/bedrock/run.yaml @@ -0,0 +1 @@ +../../llama_stack/templates/bedrock/run.yaml \ No newline at end of file diff --git a/distributions/databricks/build.yaml b/distributions/databricks/build.yaml deleted file mode 120000 index 66342fe6f..000000000 --- a/distributions/databricks/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/databricks/build.yaml \ No newline at end of file diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 92ebd1105..e7506537f 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,4 +1,32 @@ { + "hf-serverless": [ + "aiohttp", + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "huggingface_hub", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], "together": [ "aiosqlite", "blobfile", @@ -26,6 +54,33 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], + "vllm-gpu": [ + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "vllm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], "remote-vllm": [ "aiosqlite", "blobfile", @@ -108,6 +163,33 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], + "bedrock": [ + "aiosqlite", + "blobfile", + "boto3", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], "meta-reference-gpu": [ "accelerate", "aiosqlite", @@ -167,5 +249,33 @@ "uvicorn", "sentence-transformers --no-deps", 
"torch --index-url https://download.pytorch.org/whl/cpu" + ], + "hf-endpoint": [ + "aiohttp", + "aiosqlite", + "blobfile", + "chardet", + "chromadb-client", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "huggingface_hub", + "matplotlib", + "nltk", + "numpy", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ] } diff --git a/distributions/hf-endpoint/build.yaml b/distributions/hf-endpoint/build.yaml deleted file mode 120000 index a73c70c05..000000000 --- a/distributions/hf-endpoint/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/hf-endpoint/build.yaml \ No newline at end of file diff --git a/distributions/hf-serverless/build.yaml b/distributions/hf-serverless/build.yaml deleted file mode 120000 index f2db0fd55..000000000 --- a/distributions/hf-serverless/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/hf-serverless/build.yaml \ No newline at end of file diff --git a/distributions/ollama-gpu/build.yaml b/distributions/ollama-gpu/build.yaml deleted file mode 120000 index 8772548e0..000000000 --- a/distributions/ollama-gpu/build.yaml +++ /dev/null @@ -1 +0,0 @@ -../../llama_stack/templates/ollama/build.yaml \ No newline at end of file diff --git a/distributions/ollama-gpu/compose.yaml b/distributions/ollama-gpu/compose.yaml deleted file mode 100644 index c965c43c7..000000000 --- a/distributions/ollama-gpu/compose.yaml +++ /dev/null @@ -1,48 +0,0 @@ -services: - ollama: - image: ollama/ollama:latest - network_mode: "host" - volumes: - - ollama:/root/.ollama # this solution synchronizes with the docker volume and loads the model rocket fast - ports: - - "11434:11434" - devices: - - nvidia.com/gpu=all - environment: - - CUDA_VISIBLE_DEVICES=0 - command: [] - deploy: - resources: - reservations: - devices: - - driver: nvidia - # that's the closest analogue to --gpus; provide - # an integer amount of devices or 'all' - count: 1 - # Devices are reserved using a list of capabilities, making - # capabilities the only required field. A device MUST - # satisfy all the requested capabilities for a successful - # reservation. 
- capabilities: [gpu] - runtime: nvidia - llamastack: - depends_on: - - ollama - image: llamastack/distribution-ollama - network_mode: "host" - volumes: - - ~/.llama:/root/.llama - # Link to ollama run.yaml file - - ./run.yaml:/root/llamastack-run-ollama.yaml - ports: - - "5000:5000" - # Hack: wait for ollama server to start before starting docker - entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-ollama.yaml" - deploy: - restart_policy: - condition: on-failure - delay: 3s - max_attempts: 5 - window: 60s -volumes: - ollama: diff --git a/distributions/ollama-gpu/run.yaml b/distributions/ollama-gpu/run.yaml deleted file mode 100644 index 25471c69f..000000000 --- a/distributions/ollama-gpu/run.yaml +++ /dev/null @@ -1,46 +0,0 @@ -version: '2' -image_name: local -docker_image: null -conda_env: local -apis: -- shields -- agents -- models -- memory -- memory_banks -- inference -- safety -providers: - inference: - - provider_id: ollama - provider_type: remote::ollama - config: - url: ${env.OLLAMA_URL:http://127.0.0.1:11434} - safety: - - provider_id: meta0 - provider_type: inline::llama-guard - config: - excluded_categories: [] - memory: - - provider_id: meta0 - provider_type: inline::meta-reference - config: {} - agents: - - provider_id: meta0 - provider_type: inline::meta-reference - config: - persistence_store: - namespace: null - type: sqlite - db_path: ~/.llama/runtime/kvstore.db - telemetry: - - provider_id: meta0 - provider_type: inline::meta-reference - config: {} -models: - - model_id: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct} - provider_id: ollama - - model_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B} - provider_id: ollama -shields: - - shield_id: ${env.SAFETY_MODEL:Llama-Guard-3-1B} diff --git a/distributions/inline-vllm/build.yaml b/distributions/vllm-gpu/build.yaml similarity index 100% rename from distributions/inline-vllm/build.yaml rename to distributions/vllm-gpu/build.yaml diff --git a/distributions/inline-vllm/compose.yaml b/distributions/vllm-gpu/compose.yaml similarity index 100% rename from distributions/inline-vllm/compose.yaml rename to distributions/vllm-gpu/compose.yaml diff --git a/distributions/inline-vllm/run.yaml b/distributions/vllm-gpu/run.yaml similarity index 100% rename from distributions/inline-vllm/run.yaml rename to distributions/vllm-gpu/run.yaml diff --git a/docs/source/distributions/self_hosted_distro/bedrock.md b/docs/source/distributions/self_hosted_distro/bedrock.md index edef88390..1b88b01cc 100644 --- a/docs/source/distributions/self_hosted_distro/bedrock.md +++ b/docs/source/distributions/self_hosted_distro/bedrock.md @@ -6,59 +6,58 @@ self ``` -### Connect to a Llama Stack Bedrock Endpoint -- You may connect to Amazon Bedrock APIs for running LLM inference +The `llamastack/distribution-bedrock` distribution consists of the following provider configurations: -The `llamastack/distribution-bedrock` distribution consists of the following provider configurations. 
+| API | Provider(s) | +|-----|-------------| +| agents | `inline::meta-reference` | +| inference | `remote::bedrock` | +| memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | +| safety | `remote::bedrock` | +| telemetry | `inline::meta-reference` | -| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** | -|----------------- |--------------- |---------------- |---------------- |---------------- |---------------- | -| **Provider(s)** | remote::bedrock | meta-reference | meta-reference | remote::bedrock | meta-reference | + +### Environment Variables + +The following environment variables can be configured: + +- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `5001`) -### Docker: Start the Distribution (Single Node CPU) -> [!NOTE] -> This assumes you have valid AWS credentials configured with access to Amazon Bedrock. +### Prerequisite: API Keys -``` -$ cd distributions/bedrock && docker compose up +Make sure you have access to an AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/). + + +## Running Llama Stack with AWS Bedrock + +You can do this via Conda (by building the distribution code) or via Docker, which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code. + +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + llamastack/distribution-bedrock \ + --port $LLAMA_STACK_PORT \ + --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN ``` -Make sure in your `run.yaml` file, your inference provider is pointing to the correct AWS configuration. E.g. -``` -inference: - - provider_id: bedrock0 - provider_type: remote::bedrock - config: - aws_access_key_id: - aws_secret_access_key: - aws_session_token: - region_name: -``` - -### Conda llama stack run (Single Node CPU) +### Via Conda ```bash llama stack build --template bedrock --image-type conda -# -- modify run.yaml with valid AWS credentials -llama stack run ./run.yaml -``` - -### (Optional) Update Model Serving Configuration - -Use `llama-stack-client models list` to check the available models served by Amazon Bedrock.
- -``` -$ llama-stack-client models list -+------------------------------+------------------------------+---------------+------------+ -| identifier | llama_model | provider_id | metadata | -+==============================+==============================+===============+============+ -| Llama3.1-8B-Instruct | meta.llama3-1-8b-instruct-v1:0 | bedrock0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-70B-Instruct | meta.llama3-1-70b-instruct-v1:0 | bedrock0 | {} | -+------------------------------+------------------------------+---------------+------------+ -| Llama3.1-405B-Instruct | meta.llama3-1-405b-instruct-v1:0 | bedrock0 | {} | -+------------------------------+------------------------------+---------------+------------+ +llama stack run ./run.yaml \ + --port $LLAMA_STACK_PORT \ + --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN ``` diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index e30bb1480..096eee4f5 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -58,9 +58,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-fireworks \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY ``` @@ -70,6 +68,6 @@ docker run \ ```bash llama stack build --template fireworks --image-type conda llama stack run ./run.yaml \ - --port 5001 \ + --port $LLAMA_STACK_PORT \ --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY ``` diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index 65e1c8cf8..702f0ae0f 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -54,9 +54,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-meta-reference-gpu \ - /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct ``` @@ -67,9 +65,7 @@ If you are using Llama Stack Safety / Shield APIs, use: docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run-with-safety.yaml:/root/my-run.yaml \ llamastack/distribution-meta-reference-gpu \ - /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B @@ -81,7 +77,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a ```bash llama stack build --template meta-reference-gpu --image-type conda -llama stack run ./run.yaml \ +llama stack run distributions/meta-reference-gpu/run.yaml \ --port 5001 \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct ``` @@ -89,7 +85,7 @@ llama stack run ./run.yaml \ If you are using Llama Stack Safety / Shield APIs, use: ```bash -llama stack run ./run-with-safety.yaml \ +llama stack run distributions/meta-reference-gpu/run-with-safety.yaml \ --port 5001 \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B diff --git a/docs/source/distributions/self_hosted_distro/ollama.md 
b/docs/source/distributions/self_hosted_distro/ollama.md index fe65172f3..16c936f9e 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -66,9 +66,7 @@ docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -v ~/.llama:/root/.llama \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-ollama \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ --env OLLAMA_URL=http://host.docker.internal:11434 diff --git a/docs/source/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md index 3209b9100..a2315a770 100644 --- a/docs/source/distributions/self_hosted_distro/tgi.md +++ b/docs/source/distributions/self_hosted_distro/tgi.md @@ -85,9 +85,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-tgi \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT @@ -116,18 +114,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a ```bash llama stack build --template tgi --image-type conda -llama stack run ./run.yaml - --port 5001 - --env INFERENCE_MODEL=$INFERENCE_MODEL +llama stack run ./run.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT ``` If you are using Llama Stack Safety / Shield APIs, use: ```bash -llama stack run ./run-with-safety.yaml - --port 5001 - --env INFERENCE_MODEL=$INFERENCE_MODEL - --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT - --env SAFETY_MODEL=$SAFETY_MODEL +llama stack run ./run-with-safety.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \ + --env SAFETY_MODEL=$SAFETY_MODEL \ --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT ``` diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md index 303c62dcb..6e392c1e0 100644 --- a/docs/source/distributions/self_hosted_distro/together.md +++ b/docs/source/distributions/self_hosted_distro/together.md @@ -57,9 +57,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-together \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env TOGETHER_API_KEY=$TOGETHER_API_KEY ``` @@ -69,6 +67,6 @@ docker run \ ```bash llama stack build --template together --image-type conda llama stack run ./run.yaml \ - --port 5001 \ + --port $LLAMA_STACK_PORT \ --env TOGETHER_API_KEY=$TOGETHER_API_KEY ``` diff --git a/llama_stack/providers/inline/inference/vllm/config.py b/llama_stack/providers/inline/inference/vllm/config.py index 8a95298f4..42b75332f 100644 --- a/llama_stack/providers/inline/inference/vllm/config.py +++ b/llama_stack/providers/inline/inference/vllm/config.py @@ -37,11 +37,11 @@ class VLLMConfig(BaseModel): @classmethod def sample_run_config(cls): return { - "model": "${env.VLLM_INFERENCE_MODEL:Llama3.2-3B-Instruct}", - "tensor_parallel_size": "${env.VLLM_TENSOR_PARALLEL_SIZE:1}", - "max_tokens": "${env.VLLM_MAX_TOKENS:4096}", - "enforce_eager": "${env.VLLM_ENFORCE_EAGER:False}", - "gpu_memory_utilization": "${env.VLLM_GPU_MEMORY_UTILIZATION:0.3}", + "model": "${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}", + "tensor_parallel_size":
"${env.TENSOR_PARALLEL_SIZE:1}", + "max_tokens": "${env.MAX_TOKENS:4096}", + "enforce_eager": "${env.ENFORCE_EAGER:False}", + "gpu_memory_utilization": "${env.GPU_MEMORY_UTILIZATION:0.7}", } @field_validator("model") diff --git a/llama_stack/providers/remote/inference/bedrock/config.py b/llama_stack/providers/remote/inference/bedrock/config.py index 8e194700c..f2e8930be 100644 --- a/llama_stack/providers/remote/inference/bedrock/config.py +++ b/llama_stack/providers/remote/inference/bedrock/config.py @@ -4,11 +4,8 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. -from llama_models.schema_utils import json_schema_type - from llama_stack.providers.utils.bedrock.config import BedrockBaseConfig -@json_schema_type class BedrockConfig(BedrockBaseConfig): pass diff --git a/llama_stack/providers/remote/inference/tgi/config.py b/llama_stack/providers/remote/inference/tgi/config.py index 55bda4179..230eaacab 100644 --- a/llama_stack/providers/remote/inference/tgi/config.py +++ b/llama_stack/providers/remote/inference/tgi/config.py @@ -37,6 +37,18 @@ class InferenceEndpointImplConfig(BaseModel): description="Your Hugging Face user access token (will default to locally saved token if not provided)", ) + @classmethod + def sample_run_config( + cls, + endpoint_name: str = "${env.INFERENCE_ENDPOINT_NAME}", + api_token: str = "${env.HF_API_TOKEN}", + **kwargs, + ): + return { + "endpoint_name": endpoint_name, + "api_token": api_token, + } + @json_schema_type class InferenceAPIImplConfig(BaseModel): @@ -47,3 +59,15 @@ class InferenceAPIImplConfig(BaseModel): default=None, description="Your Hugging Face user access token (will default to locally saved token if not provided)", ) + + @classmethod + def sample_run_config( + cls, + repo: str = "${env.INFERENCE_MODEL}", + api_token: str = "${env.HF_API_TOKEN}", + **kwargs, + ): + return { + "huggingface_repo": repo, + "api_token": api_token, + } diff --git a/llama_stack/providers/utils/bedrock/config.py b/llama_stack/providers/utils/bedrock/config.py index 55c5582a1..64865bd5f 100644 --- a/llama_stack/providers/utils/bedrock/config.py +++ b/llama_stack/providers/utils/bedrock/config.py @@ -5,11 +5,9 @@ # the root directory of this source tree. from typing import Optional -from llama_models.schema_utils import json_schema_type from pydantic import BaseModel, Field -@json_schema_type class BedrockBaseConfig(BaseModel): aws_access_key_id: Optional[str] = Field( default=None, @@ -57,3 +55,7 @@ class BedrockBaseConfig(BaseModel): default=3600, description="The time in seconds till a session expires. The default is 3600 seconds (1 hour).", ) + + @classmethod + def sample_run_config(cls, **kwargs): + return {} diff --git a/llama_stack/templates/bedrock/__init__.py b/llama_stack/templates/bedrock/__init__.py new file mode 100644 index 000000000..4e7965550 --- /dev/null +++ b/llama_stack/templates/bedrock/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .bedrock import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/bedrock/bedrock.py b/llama_stack/templates/bedrock/bedrock.py new file mode 100644 index 000000000..cf3c342fe --- /dev/null +++ b/llama_stack/templates/bedrock/bedrock.py @@ -0,0 +1,38 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pathlib import Path + +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["remote::bedrock"], + "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["remote::bedrock"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + } + + return DistributionTemplate( + name="bedrock", + distro_type="self_hosted", + description="Use AWS Bedrock for running LLM inference and safety", + docker_image=None, + template_path=Path(__file__).parent / "doc_template.md", + providers=providers, + default_models=[], + run_configs={ + "run.yaml": RunConfigSettings(), + }, + run_config_env_vars={ + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + }, + ) diff --git a/llama_stack/templates/bedrock/build.yaml b/llama_stack/templates/bedrock/build.yaml index c87762043..c73db3eae 100644 --- a/llama_stack/templates/bedrock/build.yaml +++ b/llama_stack/templates/bedrock/build.yaml @@ -1,9 +1,19 @@ +version: '2' name: bedrock distribution_spec: - description: Use Amazon Bedrock APIs. + description: Use AWS Bedrock for running LLM inference and safety + docker_image: null providers: - inference: remote::bedrock - memory: inline::faiss - safety: inline::llama-guard - agents: inline::meta-reference - telemetry: inline::meta-reference + inference: + - remote::bedrock + memory: + - inline::faiss + - remote::chromadb + - remote::pgvector + safety: + - remote::bedrock + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference +image_type: conda diff --git a/llama_stack/templates/bedrock/doc_template.md b/llama_stack/templates/bedrock/doc_template.md new file mode 100644 index 000000000..9331382b6 --- /dev/null +++ b/llama_stack/templates/bedrock/doc_template.md @@ -0,0 +1,63 @@ +# Bedrock Distribution + +The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations: + +{{ providers_table }} + + +{% if run_config_env_vars %} +### Environment Variables + +The following environment variables can be configured: + +{% for var, (default_value, description) in run_config_env_vars.items() %} +- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) +{% endfor %} +{% endif %} + +{% if default_models %} +### Models + +The following models are available by default: + +{% for model in default_models %} +- `{{ model.model_id }} ({{ model.provider_model_id }})` +{% endfor %} +{% endif %} + + +### Prerequisite: API Keys + +Make sure you have access to an AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/). + + +## Running Llama Stack with AWS Bedrock + +You can do this via Conda (by building the distribution code) or via Docker, which has a pre-built image. + +### Via Docker + +This method allows you to get started quickly without having to build the distribution code.
+ +```bash +LLAMA_STACK_PORT=5001 +docker run \ + -it \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + llamastack/distribution-{{ name }} \ + --port $LLAMA_STACK_PORT \ + --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN +``` + +### Via Conda + +```bash +llama stack build --template {{ name }} --image-type conda +llama stack run ./run.yaml \ + --port $LLAMA_STACK_PORT \ + --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN +``` diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml new file mode 100644 index 000000000..1f632a1f2 --- /dev/null +++ b/llama_stack/templates/bedrock/run.yaml @@ -0,0 +1,49 @@ +version: '2' +image_name: bedrock +docker_image: null +conda_env: bedrock +apis: +- agents +- inference +- memory +- safety +- telemetry +providers: + inference: + - provider_id: bedrock + provider_type: remote::bedrock + config: {} + memory: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/faiss_store.db + safety: + - provider_id: bedrock + provider_type: remote::bedrock + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/registry.db +models: [] +shields: [] +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/llama_stack/templates/databricks/build.yaml b/llama_stack/templates/databricks/build.yaml deleted file mode 100644 index aa22f54b2..000000000 --- a/llama_stack/templates/databricks/build.yaml +++ /dev/null @@ -1,9 +0,0 @@ -name: databricks -distribution_spec: - description: Use Databricks for running LLM inference - providers: - inference: remote::databricks - memory: inline::faiss - safety: inline::llama-guard - agents: meta-reference - telemetry: meta-reference diff --git a/llama_stack/templates/fireworks/doc_template.md b/llama_stack/templates/fireworks/doc_template.md index 2a91ece07..2f4be574d 100644 --- a/llama_stack/templates/fireworks/doc_template.md +++ b/llama_stack/templates/fireworks/doc_template.md @@ -43,9 +43,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-{{ name }} \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY ``` @@ -55,6 +53,6 @@ docker run \ ```bash llama stack build --template fireworks --image-type conda llama stack run ./run.yaml \ - --port 5001 \ + --port $LLAMA_STACK_PORT \ --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY ``` diff --git a/llama_stack/templates/hf-endpoint/__init__.py b/llama_stack/templates/hf-endpoint/__init__.py new file mode 100644 index 000000000..f2c00e3bf --- /dev/null +++ b/llama_stack/templates/hf-endpoint/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .hf_endpoint import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/hf-endpoint/build.yaml b/llama_stack/templates/hf-endpoint/build.yaml index 61fd12a2c..798cb3961 100644 --- a/llama_stack/templates/hf-endpoint/build.yaml +++ b/llama_stack/templates/hf-endpoint/build.yaml @@ -1,9 +1,19 @@ +version: '2' name: hf-endpoint distribution_spec: - description: "Like local, but use Hugging Face Inference Endpoints for running LLM inference.\nSee https://hf.co/docs/api-endpoints." + description: Use (an external) Hugging Face Inference Endpoint for running LLM inference + docker_image: null providers: - inference: remote::hf::endpoint - memory: inline::faiss - safety: inline::llama-guard - agents: inline::meta-reference - telemetry: inline::meta-reference + inference: + - remote::hf::endpoint + memory: + - inline::faiss + - remote::chromadb + - remote::pgvector + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference +image_type: conda diff --git a/llama_stack/templates/hf-endpoint/hf_endpoint.py b/llama_stack/templates/hf-endpoint/hf_endpoint.py new file mode 100644 index 000000000..af00114ba --- /dev/null +++ b/llama_stack/templates/hf-endpoint/hf_endpoint.py @@ -0,0 +1,97 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["remote::hf::endpoint"], + "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + } + + inference_provider = Provider( + provider_id="hf-endpoint", + provider_type="remote::hf::endpoint", + config=InferenceEndpointImplConfig.sample_run_config(), + ) + + inference_model = ModelInput( + model_id="${env.INFERENCE_MODEL}", + provider_id="hf-endpoint", + ) + safety_model = ModelInput( + model_id="${env.SAFETY_MODEL}", + provider_id="hf-endpoint-safety", + ) + + return DistributionTemplate( + name="hf-endpoint", + distro_type="self_hosted", + description="Use (an external) Hugging Face Inference Endpoint for running LLM inference", + docker_image=None, + template_path=None, + providers=providers, + default_models=[inference_model, safety_model], + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + }, + default_models=[inference_model], + ), + "run-with-safety.yaml": RunConfigSettings( + provider_overrides={ + "inference": [ + inference_provider, + Provider( + provider_id="hf-endpoint-safety", + provider_type="remote::hf::endpoint", + config=InferenceEndpointImplConfig.sample_run_config( + endpoint_name="${env.SAFETY_INFERENCE_ENDPOINT_NAME}", + ), + ), + ] + }, + default_models=[ + inference_model, + safety_model, + ], + default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], + ), + }, + run_config_env_vars={ + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack 
distribution server", + ), + "HF_API_TOKEN": ( + "hf_...", + "Hugging Face API token", + ), + "INFERENCE_ENDPOINT_NAME": ( + "", + "HF Inference endpoint name for the main inference model", + ), + "SAFETY_INFERENCE_ENDPOINT_NAME": ( + "", + "HF Inference endpoint for the safety model", + ), + "INFERENCE_MODEL": ( + "meta-llama/Llama-3.2-3B-Instruct", + "Inference model served by the HF Inference Endpoint", + ), + "SAFETY_MODEL": ( + "meta-llama/Llama-Guard-3-1B", + "Safety model served by the HF Inference Endpoint", + ), + }, + ) diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml new file mode 100644 index 000000000..d518f29b8 --- /dev/null +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -0,0 +1,68 @@ +version: '2' +image_name: hf-endpoint +docker_image: null +conda_env: hf-endpoint +apis: +- agents +- inference +- memory +- safety +- telemetry +providers: + inference: + - provider_id: hf-endpoint + provider_type: remote::hf::endpoint + config: + endpoint_name: ${env.INFERENCE_ENDPOINT_NAME} + api_token: ${env.HF_API_TOKEN} + - provider_id: hf-endpoint-safety + provider_type: remote::hf::endpoint + config: + endpoint_name: ${env.SAFETY_INFERENCE_ENDPOINT_NAME} + api_token: ${env.HF_API_TOKEN} + memory: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: hf-endpoint + provider_model_id: null +- metadata: {} + model_id: ${env.SAFETY_MODEL} + provider_id: hf-endpoint-safety + provider_model_id: null +shields: +- params: null + shield_id: ${env.SAFETY_MODEL} + provider_id: null + provider_shield_id: null +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml new file mode 100644 index 000000000..ff4e90606 --- /dev/null +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -0,0 +1,55 @@ +version: '2' +image_name: hf-endpoint +docker_image: null +conda_env: hf-endpoint +apis: +- agents +- inference +- memory +- safety +- telemetry +providers: + inference: + - provider_id: hf-endpoint + provider_type: remote::hf::endpoint + config: + endpoint_name: ${env.INFERENCE_ENDPOINT_NAME} + api_token: ${env.HF_API_TOKEN} + memory: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/agents_store.db + telemetry: + - 
provider_id: meta-reference + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/registry.db +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: hf-endpoint + provider_model_id: null +shields: [] +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/llama_stack/templates/hf-serverless/__init__.py b/llama_stack/templates/hf-serverless/__init__.py new file mode 100644 index 000000000..a5f1ab54a --- /dev/null +++ b/llama_stack/templates/hf-serverless/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .hf_serverless import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/hf-serverless/build.yaml b/llama_stack/templates/hf-serverless/build.yaml index 065a14517..3c03a98c1 100644 --- a/llama_stack/templates/hf-serverless/build.yaml +++ b/llama_stack/templates/hf-serverless/build.yaml @@ -1,9 +1,19 @@ +version: '2' name: hf-serverless distribution_spec: - description: "Like local, but use Hugging Face Inference API (serverless) for running LLM inference.\nSee https://hf.co/docs/api-inference." + description: Use (an external) Hugging Face Inference Endpoint for running LLM inference + docker_image: null providers: - inference: remote::hf::serverless - memory: inline::faiss - safety: inline::llama-guard - agents: inline::meta-reference - telemetry: inline::meta-reference + inference: + - remote::hf::serverless + memory: + - inline::faiss + - remote::chromadb + - remote::pgvector + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference +image_type: conda diff --git a/llama_stack/templates/hf-serverless/hf_serverless.py b/llama_stack/templates/hf-serverless/hf_serverless.py new file mode 100644 index 000000000..5434de986 --- /dev/null +++ b/llama_stack/templates/hf-serverless/hf_serverless.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["remote::hf::serverless"], + "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + } + + inference_provider = Provider( + provider_id="hf-serverless", + provider_type="remote::hf::serverless", + config=InferenceAPIImplConfig.sample_run_config(), + ) + + inference_model = ModelInput( + model_id="${env.INFERENCE_MODEL}", + provider_id="hf-serverless", + ) + safety_model = ModelInput( + model_id="${env.SAFETY_MODEL}", + provider_id="hf-serverless-safety", + ) + + return DistributionTemplate( + name="hf-serverless", + distro_type="self_hosted", + description="Use (an external) Hugging Face Inference Endpoint for running LLM inference", + docker_image=None, + template_path=None, + providers=providers, + default_models=[inference_model, safety_model], + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + }, + default_models=[inference_model], + ), + "run-with-safety.yaml": RunConfigSettings( + provider_overrides={ + "inference": [ + inference_provider, + Provider( + provider_id="hf-serverless-safety", + provider_type="remote::hf::serverless", + config=InferenceAPIImplConfig.sample_run_config( + repo="${env.SAFETY_MODEL}", + ), + ), + ] + }, + default_models=[ + inference_model, + safety_model, + ], + default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], + ), + }, + run_config_env_vars={ + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "HF_API_TOKEN": ( + "hf_...", + "Hugging Face API token", + ), + "INFERENCE_MODEL": ( + "meta-llama/Llama-3.2-3B-Instruct", + "Inference model to be served by the HF Serverless endpoint", + ), + "SAFETY_MODEL": ( + "meta-llama/Llama-Guard-3-1B", + "Safety model to be served by the HF Serverless endpoint", + ), + }, + ) diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml new file mode 100644 index 000000000..e7591bbf0 --- /dev/null +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -0,0 +1,68 @@ +version: '2' +image_name: hf-serverless +docker_image: null +conda_env: hf-serverless +apis: +- agents +- inference +- memory +- safety +- telemetry +providers: + inference: + - provider_id: hf-serverless + provider_type: remote::hf::serverless + config: + huggingface_repo: ${env.INFERENCE_MODEL} + api_token: ${env.HF_API_TOKEN} + - provider_id: hf-serverless-safety + provider_type: remote::hf::serverless + config: + huggingface_repo: ${env.SAFETY_MODEL} + api_token: ${env.HF_API_TOKEN} + memory: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db + 
telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: hf-serverless + provider_model_id: null +- metadata: {} + model_id: ${env.SAFETY_MODEL} + provider_id: hf-serverless-safety + provider_model_id: null +shields: +- params: null + shield_id: ${env.SAFETY_MODEL} + provider_id: null + provider_shield_id: null +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml new file mode 100644 index 000000000..d7ec02f6a --- /dev/null +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -0,0 +1,55 @@ +version: '2' +image_name: hf-serverless +docker_image: null +conda_env: hf-serverless +apis: +- agents +- inference +- memory +- safety +- telemetry +providers: + inference: + - provider_id: hf-serverless + provider_type: remote::hf::serverless + config: + huggingface_repo: ${env.INFERENCE_MODEL} + api_token: ${env.HF_API_TOKEN} + memory: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/registry.db +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: hf-serverless + provider_model_id: null +shields: [] +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/llama_stack/templates/inline-vllm/build.yaml b/llama_stack/templates/inline-vllm/build.yaml deleted file mode 100644 index 61d9e4db8..000000000 --- a/llama_stack/templates/inline-vllm/build.yaml +++ /dev/null @@ -1,13 +0,0 @@ -name: meta-reference-gpu -distribution_spec: - docker_image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime - description: Use code from `llama_stack` itself to serve all llama stack APIs - providers: - inference: inline::meta-reference - memory: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: inline::llama-guard - agents: inline::meta-reference - telemetry: inline::meta-reference diff --git a/llama_stack/templates/meta-reference-gpu/doc_template.md b/llama_stack/templates/meta-reference-gpu/doc_template.md index 9a61ff691..de09efdb0 100644 --- a/llama_stack/templates/meta-reference-gpu/doc_template.md +++ b/llama_stack/templates/meta-reference-gpu/doc_template.md @@ -40,9 +40,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-{{ name }} \ - /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct ``` @@ -53,9 +51,7 @@ If you are using Llama Stack Safety / Shield APIs, use: docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run-with-safety.yaml:/root/my-run.yaml \ 
llamastack/distribution-{{ name }} \ - /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B @@ -66,8 +62,8 @@ docker run \ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. ```bash -llama stack build --template meta-reference-gpu --image-type conda -llama stack run ./run.yaml \ +llama stack build --template {{ name }} --image-type conda +llama stack run distributions/{{ name }}/run.yaml \ --port 5001 \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct ``` @@ -75,7 +71,7 @@ llama stack run ./run.yaml \ If you are using Llama Stack Safety / Shield APIs, use: ```bash -llama stack run ./run-with-safety.yaml \ +llama stack run distributions/{{ name }}/run-with-safety.yaml \ --port 5001 \ --env INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ --env SAFETY_MODEL=meta-llama/Llama-Guard-3-1B diff --git a/llama_stack/templates/meta-reference-quantized-gpu/__init__.py b/llama_stack/templates/meta-reference-quantized-gpu/__init__.py new file mode 100644 index 000000000..1cfdb2c6a --- /dev/null +++ b/llama_stack/templates/meta-reference-quantized-gpu/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from .meta_reference import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md new file mode 100644 index 000000000..afe1e3e20 --- /dev/null +++ b/llama_stack/templates/meta-reference-quantized-gpu/doc_template.md @@ -0,0 +1,54 @@ +# Meta Reference Quantized Distribution + +The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists of the following provider configurations. + + +| **API** | **Inference** | **Agents** | **Memory** | **Safety** | **Telemetry** | +|----------------- |------------------------ |---------------- |-------------------------------------------------- |---------------- |---------------- | +| **Provider(s)** | meta-reference-quantized | meta-reference | meta-reference, remote::pgvector, remote::chroma | meta-reference | meta-reference | + +The only difference vs. the `meta-reference-gpu` distribution is that it supports more efficient inference -- with fp8, int4 quantization, etc. + +### Step 0. Prerequisite - Downloading Models +Please make sure you have llama model checkpoints downloaded in `~/.llama` before proceeding. See the [installation guide](https://llama-stack.readthedocs.io/en/latest/cli_reference/download_models.html) to download the models. + +``` +$ ls ~/.llama/checkpoints +Llama3.2-3B-Instruct:int4-qlora-eo8 +``` + +### Step 1. Start the Distribution +#### (Option 1) Start with Docker +``` +$ cd distributions/meta-reference-quantized-gpu && docker compose up +``` + +> [!NOTE] +> This assumes you have a GPU available, since the local server is started with access to your GPU. + + +> [!NOTE] +> `~/.llama` should be the path containing downloaded weights of Llama models. + + +This will download and start running a pre-built docker container.
Alternatively, you may use the following commands: + +``` +docker run -it -p 5000:5000 -v ~/.llama:/root/.llama -v ./run.yaml:/root/my-run.yaml --gpus=all distribution-meta-reference-quantized-gpu --yaml_config /root/my-run.yaml +``` + +#### (Option 2) Start with Conda + +1. Install the `llama` CLI. See [CLI Reference](https://llama-stack.readthedocs.io/en/latest/cli_reference/index.html) + +2. Build the `meta-reference-quantized-gpu` distribution + +``` +$ llama stack build --template meta-reference-quantized-gpu --image-type conda +``` + +3. Start running the distribution +``` +$ cd distributions/meta-reference-quantized-gpu +$ llama stack run ./run.yaml +``` diff --git a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py new file mode 100644 index 000000000..f254bc920 --- /dev/null +++ b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py @@ -0,0 +1,100 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from pathlib import Path + +from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.inline.inference.meta_reference import ( + MetaReferenceInferenceConfig, +) +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["inline::meta-reference"], + "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + } + + inference_provider = Provider( + provider_id="meta-reference-inference", + provider_type="inline::meta-reference", + config=MetaReferenceInferenceConfig.sample_run_config( + model="${env.INFERENCE_MODEL}", + checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}", + ), + ) + + inference_model = ModelInput( + model_id="${env.INFERENCE_MODEL}", + provider_id="meta-reference-inference", + ) + safety_model = ModelInput( + model_id="${env.SAFETY_MODEL}", + provider_id="meta-reference-safety", + ) + + return DistributionTemplate( + name="meta-reference-quantized-gpu", + distro_type="self_hosted", + description="Use Meta Reference for running LLM inference", + template_path=Path(__file__).parent / "doc_template.md", + providers=providers, + default_models=[inference_model, safety_model], + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + }, + default_models=[inference_model], + ), + "run-with-safety.yaml": RunConfigSettings( + provider_overrides={ + "inference": [ + inference_provider, + Provider( + provider_id="meta-reference-safety", + provider_type="inline::meta-reference", + config=MetaReferenceInferenceConfig.sample_run_config( + model="${env.SAFETY_MODEL}", + checkpoint_dir="${env.SAFETY_CHECKPOINT_DIR:null}", + ), + ), + ], + }, + default_models=[ + inference_model, + safety_model, + ], + default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], + ), + }, + run_config_env_vars={ + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "INFERENCE_MODEL": ( + "meta-llama/Llama-3.2-3B-Instruct", + "Inference model loaded into the Meta Reference server", + ), + "INFERENCE_CHECKPOINT_DIR": ( + "null", + "Directory containing the Meta Reference
model checkpoint", + ), + "SAFETY_MODEL": ( + "meta-llama/Llama-Guard-3-1B", + "Name of the safety (Llama-Guard) model to use", + ), + "SAFETY_CHECKPOINT_DIR": ( + "null", + "Directory containing the Llama-Guard model checkpoint", + ), + }, + ) diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md index 5a7a0d2f7..09fe8eabc 100644 --- a/llama_stack/templates/ollama/doc_template.md +++ b/llama_stack/templates/ollama/doc_template.md @@ -55,9 +55,7 @@ docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ -v ~/.llama:/root/.llama \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-{{ name }} \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ --env OLLAMA_URL=http://host.docker.internal:11434 @@ -86,7 +84,7 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a ```bash export LLAMA_STACK_PORT=5001 -llama stack build --template ollama --image-type conda +llama stack build --template {{ name }} --image-type conda llama stack run ./run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index fe0278718..bf74b95d1 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ -27,7 +27,7 @@ from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig class RunConfigSettings(BaseModel): provider_overrides: Dict[str, List[Provider]] = Field(default_factory=dict) - default_models: List[ModelInput] + default_models: Optional[List[ModelInput]] = None default_shields: Optional[List[ShieldInput]] = None def run_config( @@ -87,7 +87,7 @@ class RunConfigSettings(BaseModel): __distro_dir__=f"distributions/{name}", db_name="registry.db", ), - models=self.default_models, + models=self.default_models or [], shields=self.default_shields or [], ) @@ -104,7 +104,7 @@ class DistributionTemplate(BaseModel): providers: Dict[str, List[str]] run_configs: Dict[str, RunConfigSettings] - template_path: Path + template_path: Optional[Path] = None # Optional configuration run_config_env_vars: Optional[Dict[str, Tuple[str, str]]] = None @@ -159,6 +159,7 @@ class DistributionTemplate(BaseModel): with open(yaml_output_dir / yaml_pth, "w") as f: yaml.safe_dump(run_config.model_dump(), f, sort_keys=False) - docs = self.generate_markdown_docs() - with open(doc_output_dir / f"{self.name}.md", "w") as f: - f.write(docs if docs.endswith("\n") else docs + "\n") + if self.template_path: + docs = self.generate_markdown_docs() + with open(doc_output_dir / f"{self.name}.md", "w") as f: + f.write(docs if docs.endswith("\n") else docs + "\n") diff --git a/llama_stack/templates/tgi/doc_template.md b/llama_stack/templates/tgi/doc_template.md index 0f6001e1a..42124696f 100644 --- a/llama_stack/templates/tgi/doc_template.md +++ b/llama_stack/templates/tgi/doc_template.md @@ -71,9 +71,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-{{ name }} \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env INFERENCE_MODEL=$INFERENCE_MODEL \ --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT @@ -102,18 +100,18 @@ Make sure you have done `pip install llama-stack` and have the Llama Stack CLI a ```bash llama stack build --template {{ name }} --image-type conda llama stack run ./run.yaml - --port 5001 - --env INFERENCE_MODEL=$INFERENCE_MODEL + 
--port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT ``` If you are using Llama Stack Safety / Shield APIs, use: ```bash -llama stack run ./run-with-safety.yaml - --port 5001 - --env INFERENCE_MODEL=$INFERENCE_MODEL - --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT - --env SAFETY_MODEL=$SAFETY_MODEL +llama stack run ./run-with-safety.yaml \ + --port $LLAMA_STACK_PORT \ + --env INFERENCE_MODEL=$INFERENCE_MODEL \ + --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \ + --env SAFETY_MODEL=$SAFETY_MODEL \ --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT ``` diff --git a/llama_stack/templates/together/doc_template.md b/llama_stack/templates/together/doc_template.md index 5c1580dac..3fc94dd35 100644 --- a/llama_stack/templates/together/doc_template.md +++ b/llama_stack/templates/together/doc_template.md @@ -43,9 +43,7 @@ LLAMA_STACK_PORT=5001 docker run \ -it \ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ llamastack/distribution-{{ name }} \ - --yaml-config /root/my-run.yaml \ --port $LLAMA_STACK_PORT \ --env TOGETHER_API_KEY=$TOGETHER_API_KEY ``` @@ -53,8 +51,8 @@ docker run \ ### Via Conda ```bash -llama stack build --template together --image-type conda +llama stack build --template {{ name }} --image-type conda llama stack run ./run.yaml \ - --port 5001 \ + --port $LLAMA_STACK_PORT \ --env TOGETHER_API_KEY=$TOGETHER_API_KEY ``` diff --git a/llama_stack/templates/vllm-gpu/__init__.py b/llama_stack/templates/vllm-gpu/__init__.py new file mode 100644 index 000000000..7b3d59a01 --- /dev/null +++ b/llama_stack/templates/vllm-gpu/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from .vllm import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/vllm-gpu/build.yaml b/llama_stack/templates/vllm-gpu/build.yaml new file mode 100644 index 000000000..6792a855f --- /dev/null +++ b/llama_stack/templates/vllm-gpu/build.yaml @@ -0,0 +1,19 @@ +version: '2' +name: vllm-gpu +distribution_spec: + description: Use a built-in vLLM engine for running LLM inference + docker_image: null + providers: + inference: + - inline::vllm + memory: + - inline::faiss + - remote::chromadb + - remote::pgvector + safety: + - inline::llama-guard + agents: + - inline::meta-reference + telemetry: + - inline::meta-reference +image_type: conda diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml new file mode 100644 index 000000000..a140ad403 --- /dev/null +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -0,0 +1,58 @@ +version: '2' +image_name: vllm-gpu +docker_image: null +conda_env: vllm-gpu +apis: +- agents +- inference +- memory +- safety +- telemetry +providers: + inference: + - provider_id: vllm + provider_type: inline::vllm + config: + model: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct} + tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:1} + max_tokens: ${env.MAX_TOKENS:4096} + enforce_eager: ${env.ENFORCE_EAGER:False} + gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7} + memory: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} +metadata_store: + namespace: null + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/registry.db +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: vllm + provider_model_id: null +shields: [] +memory_banks: [] +datasets: [] +scoring_fns: [] +eval_tasks: [] diff --git a/llama_stack/templates/vllm-gpu/vllm.py b/llama_stack/templates/vllm-gpu/vllm.py new file mode 100644 index 000000000..78fcf4f57 --- /dev/null +++ b/llama_stack/templates/vllm-gpu/vllm.py @@ -0,0 +1,74 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +from llama_stack.distribution.datatypes import ModelInput, Provider +from llama_stack.providers.inline.inference.vllm import VLLMConfig +from llama_stack.templates.template import DistributionTemplate, RunConfigSettings + + +def get_distribution_template() -> DistributionTemplate: + providers = { + "inference": ["inline::vllm"], + "memory": ["inline::faiss", "remote::chromadb", "remote::pgvector"], + "safety": ["inline::llama-guard"], + "agents": ["inline::meta-reference"], + "telemetry": ["inline::meta-reference"], + } + + inference_provider = Provider( + provider_id="vllm", + provider_type="inline::vllm", + config=VLLMConfig.sample_run_config(), + ) + + inference_model = ModelInput( + model_id="${env.INFERENCE_MODEL}", + provider_id="vllm", + ) + + return DistributionTemplate( + name="vllm-gpu", + distro_type="self_hosted", + description="Use a built-in vLLM engine for running LLM inference", + docker_image=None, + template_path=None, + providers=providers, + default_models=[inference_model], + run_configs={ + "run.yaml": RunConfigSettings( + provider_overrides={ + "inference": [inference_provider], + }, + default_models=[inference_model], + ), + }, + run_config_env_vars={ + "LLAMASTACK_PORT": ( + "5001", + "Port for the Llama Stack distribution server", + ), + "INFERENCE_MODEL": ( + "meta-llama/Llama-3.2-3B-Instruct", + "Inference model loaded into the vLLM engine", + ), + "TENSOR_PARALLEL_SIZE": ( + "1", + "Number of tensor parallel replicas (number of GPUs to use).", + ), + "MAX_TOKENS": ( + "4096", + "Maximum number of tokens to generate.", + ), + "ENFORCE_EAGER": ( + "False", + "Whether to use eager mode for inference (otherwise cuda graphs are used).", + ), + "GPU_MEMORY_UTILIZATION": ( + "0.7", + "GPU memory utilization for the vLLM engine.", + ), + }, + )
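
The template modules added in this diff all follow the same contract: each package exports `get_distribution_template()`, and the generator writes one YAML file per `RunConfigSettings` entry, rendering `doc_template.md` into the docs tree only when `template_path` is set (which is why the `template.py` hunk makes `template_path` optional and guards the docs step). A minimal driver sketch, assuming the method wrapping the `yaml_output_dir` / `doc_output_dir` logic visible in the `template.py` hunk is named `save_distribution`:

```python
from pathlib import Path

from llama_stack.templates.bedrock import get_distribution_template

template = get_distribution_template()
# Writes one YAML file per entry in run_configs (for bedrock, just run.yaml)
# and, because the bedrock template sets template_path, also renders
# doc_template.md to bedrock.md. Templates with template_path=None
# (hf-endpoint, hf-serverless, vllm-gpu) skip the docs step entirely.
# NOTE: save_distribution and its keyword names are assumed from the
# partial template.py hunk above.
template.save_distribution(
    yaml_output_dir=Path("llama_stack/templates/bedrock"),
    doc_output_dir=Path("docs/source/distributions/self_hosted_distro"),
)
```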
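The `${env.VAR}` and `${env.VAR:default}` references used throughout the new run.yaml files are resolved from the server's environment at startup, which is what the `--env KEY=VALUE` flags in the docs feed into. An illustrative sketch of the substitution semantics (not the stack's actual resolver):

```python
import os
import re

# ${env.VAR} requires VAR to be set; ${env.VAR:default} falls back to default.
_ENV_REF = re.compile(r"\$\{env\.([A-Za-z_][A-Za-z0-9_]*)(?::([^}]*))?\}")


def resolve_env_refs(value: str) -> str:
    def _sub(match: re.Match) -> str:
        var, default = match.group(1), match.group(2)
        resolved = os.environ.get(var, default)
        if resolved is None:
            raise ValueError(f"env var {var} is not set and has no default")
        return resolved

    return _ENV_REF.sub(_sub, value)


# With SQLITE_STORE_DIR unset:
#   resolve_env_refs("${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/registry.db")
#   returns "~/.llama/distributions/bedrock/registry.db"
```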
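A related point on the bedrock provider entries, which now ship with `config: {}`: every field on `BedrockBaseConfig` is optional and its new `sample_run_config` returns an empty dict, so the working assumption is that credentials arrive via the `AWS_*` variables passed with `--env` and are picked up by boto3's default credential chain, along these lines:

```python
import boto3

# With no explicit keys in the provider config, a default Session resolves
# credentials from the standard chain: AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY / AWS_SESSION_TOKEN in the environment, then shared
# config files, then instance or container roles.
session = boto3.Session()
client = session.client("bedrock-runtime", region_name="us-east-1")  # region shown for illustration
```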