diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 32e221128..ceafa96db 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -43,14 +43,17 @@ jobs: - name: Build Llama Stack run: | - uv run llama stack build --template ollama --image-type venv + uv run llama stack build --template starter --image-type venv - name: Start Llama Stack server in background if: matrix.client-type == 'http' env: INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + OLLAMA_INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + ENABLE_OLLAMA: "ollama" + OLLAMA_URL: "http://0.0.0.0:11434" run: | - LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/ollama/run.yaml --image-type venv --env OLLAMA_URL="http://0.0.0.0:11434" & + LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run ./llama_stack/templates/starter/run.yaml --image-type venv & - name: Wait for Llama Stack server to be ready if: matrix.client-type == 'http' @@ -87,16 +90,18 @@ jobs: - name: Run Integration Tests env: INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + OLLAMA_INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" # for library tests + ENABLE_OLLAMA: "ollama" # for library tests OLLAMA_URL: "http://0.0.0.0:11434" run: | if [ "${{ matrix.client-type }}" == "library" ]; then - stack_config="ollama" + stack_config="starter" else stack_config="http://localhost:8321" fi uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \ -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \ - --text-model="meta-llama/Llama-3.2-3B-Instruct" \ + --text-model="ollama/meta-llama/Llama-3.2-3B-Instruct" \ --embedding-model=all-MiniLM-L6-v2 - name: Check Storage and Memory Available After Tests diff --git a/README.md b/README.md index 7f34c3340..dc30a9cd2 100644 --- a/README.md +++ b/README.md @@ -139,13 +139,7 @@ A Llama Stack Distribution (or "distro") is a pre-configured bundle of provider | **Distribution** | **Llama Stack Docker** | Start This Distribution | |:---------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------:| | Meta Reference | [llamastack/distribution-meta-reference-gpu](https://hub.docker.com/repository/docker/llamastack/distribution-meta-reference-gpu/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/meta-reference-gpu.html) | -| SambaNova | [llamastack/distribution-sambanova](https://hub.docker.com/repository/docker/llamastack/distribution-sambanova/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/sambanova.html) | -| Cerebras | [llamastack/distribution-cerebras](https://hub.docker.com/repository/docker/llamastack/distribution-cerebras/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/cerebras.html) | -| Ollama | [llamastack/distribution-ollama](https://hub.docker.com/repository/docker/llamastack/distribution-ollama/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/ollama.html) | -| TGI | [llamastack/distribution-tgi](https://hub.docker.com/repository/docker/llamastack/distribution-tgi/general) | 
[Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/tgi.html) | -| Together | [llamastack/distribution-together](https://hub.docker.com/repository/docker/llamastack/distribution-together/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/together.html) | -| Fireworks | [llamastack/distribution-fireworks](https://hub.docker.com/repository/docker/llamastack/distribution-fireworks/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/fireworks.html) | -| vLLM | [llamastack/distribution-remote-vllm](https://hub.docker.com/repository/docker/llamastack/distribution-remote-vllm/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) | +| Starter Distribution | [llamastack/distribution-starter](https://hub.docker.com/repository/docker/llamastack/distribution-starter/general) | [Guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/starter.html) | ### Documentation diff --git a/docs/source/distributions/building_distro.md b/docs/source/distributions/building_distro.md index 521071cc6..8df808b54 100644 --- a/docs/source/distributions/building_distro.md +++ b/docs/source/distributions/building_distro.md @@ -85,45 +85,13 @@ The following command will allow you to see the available templates and their co llama stack build --list-templates ``` -``` -------------------------------+-----------------------------------------------------------------------------+ -| Template Name | Description | -+------------------------------+-----------------------------------------------------------------------------+ -| hf-serverless | Use (an external) Hugging Face Inference Endpoint for running LLM inference | -+------------------------------+-----------------------------------------------------------------------------+ -| together | Use Together.AI for running LLM inference | -+------------------------------+-----------------------------------------------------------------------------+ -| vllm-gpu | Use a built-in vLLM engine for running LLM inference | -+------------------------------+-----------------------------------------------------------------------------+ -| experimental-post-training | Experimental template for post training | -+------------------------------+-----------------------------------------------------------------------------+ -| remote-vllm | Use (an external) vLLM server for running LLM inference | -+------------------------------+-----------------------------------------------------------------------------+ -| fireworks | Use Fireworks.AI for running LLM inference | -+------------------------------+-----------------------------------------------------------------------------+ -| tgi | Use (an external) TGI server for running LLM inference | -+------------------------------+-----------------------------------------------------------------------------+ -| bedrock | Use AWS Bedrock for running LLM inference and safety | -+------------------------------+-----------------------------------------------------------------------------+ -| meta-reference-gpu | Use Meta Reference for running LLM inference | -+------------------------------+-----------------------------------------------------------------------------+ -| nvidia | Use NVIDIA NIM for running LLM inference | -+------------------------------+-----------------------------------------------------------------------------+ -| cerebras | Use 
Cerebras for running LLM inference | -+------------------------------+-----------------------------------------------------------------------------+ -| ollama | Use (an external) Ollama server for running LLM inference | -+------------------------------+-----------------------------------------------------------------------------+ -| hf-endpoint | Use (an external) Hugging Face Inference Endpoint for running LLM inference | -+------------------------------+-----------------------------------------------------------------------------+ -``` - You may then pick a template to build your distribution with providers fitted to your liking. For example, to build a distribution with TGI as the inference provider, you can run: ``` -$ llama stack build --template tgi +$ llama stack build --template starter ... -You can now edit ~/.llama/distributions/llamastack-tgi/tgi-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-tgi/tgi-run.yaml` +You can now edit ~/.llama/distributions/llamastack-starter/starter-run.yaml and run `llama stack run ~/.llama/distributions/llamastack-starter/starter-run.yaml` ``` ::: :::{tab-item} Building from Scratch @@ -163,26 +131,7 @@ You can now edit ~/.llama/distributions/llamastack-my-local-stack/my-local-stack - The config file will be of contents like the ones in `llama_stack/templates/*build.yaml`. ``` -$ cat llama_stack/templates/ollama/build.yaml - -name: ollama -distribution_spec: - description: Like local, but use ollama for running LLM inference - providers: - inference: remote::ollama - memory: inline::faiss - safety: inline::llama-guard - agents: inline::meta-reference - telemetry: inline::meta-reference -image_name: ollama -image_type: conda - -# If some providers are external, you can specify the path to the implementation -external_providers_dir: ~/.llama/providers.d -``` - -``` -llama stack build --config llama_stack/templates/ollama/build.yaml +llama stack build --config llama_stack/templates/starter/build.yaml ``` ::: @@ -248,11 +197,11 @@ Podman is supported as an alternative to Docker. Set `CONTAINER_BINARY` to `podm To build a container image, you may start off from a template and use the `--image-type container` flag to specify `container` as the build image type. ``` -llama stack build --template ollama --image-type container +llama stack build --template starter --image-type container ``` ``` -$ llama stack build --template ollama --image-type container +$ llama stack build --template starter --image-type container ... Containerfile created successfully in /tmp/tmp.viA3a3Rdsg/ContainerfileFROM python:3.10-slim ... diff --git a/docs/source/distributions/importing_as_library.md b/docs/source/distributions/importing_as_library.md index 967a18b54..fe82d2db5 100644 --- a/docs/source/distributions/importing_as_library.md +++ b/docs/source/distributions/importing_as_library.md @@ -6,7 +6,7 @@ This avoids the overhead of setting up a server. 
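Once the setup below has been run (note the template is now `starter` rather than `ollama`), the stack is constructed in-process rather than behind a server. The following is a rough sketch of that usage; the import path and method names follow the library-client pattern in the Llama Stack docs and should be treated as illustrative rather than authoritative.

```python
# Sketch of in-process usage with the starter config. Assumes the build step
# below has completed and any provider toggles (e.g. ENABLE_OLLAMA=ollama)
# are already set in the environment.
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient

client = LlamaStackAsLibraryClient("starter")
client.initialize()  # instantiates the configured providers inside the current process

# The client then mirrors the HTTP client surface, e.g. listing registered models:
for model in client.models.list():
    print(model.identifier)
```

The `pip install` and `llama stack build` commands below are still required first, since the library client resolves the template name against the local installation.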
```bash # setup uv pip install llama-stack -llama stack build --template ollama --image-type venv +llama stack build --template starter --image-type venv ``` ```python diff --git a/docs/source/distributions/self_hosted_distro/bedrock.md b/docs/source/distributions/self_hosted_distro/bedrock.md deleted file mode 100644 index d7aedbfb2..000000000 --- a/docs/source/distributions/self_hosted_distro/bedrock.md +++ /dev/null @@ -1,79 +0,0 @@ - -# Bedrock Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-bedrock` distribution consists of the following provider configurations: - -| API | Provider(s) | -|-----|-------------| -| agents | `inline::meta-reference` | -| datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | -| inference | `remote::bedrock` | -| safety | `remote::bedrock` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | -| telemetry | `inline::meta-reference` | -| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` | -| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | - - - -### Environment Variables - -The following environment variables can be configured: - -- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`) - -### Models - -The following models are available by default: - -- `meta.llama3-1-8b-instruct-v1:0 (aliases: meta-llama/Llama-3.1-8B-Instruct)` -- `meta.llama3-1-70b-instruct-v1:0 (aliases: meta-llama/Llama-3.1-70B-Instruct)` -- `meta.llama3-1-405b-instruct-v1:0 (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)` - - -### Prerequisite: API Keys - -Make sure you have access to a AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/). - - -## Running Llama Stack with AWS Bedrock - -You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. - -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - llamastack/distribution-bedrock \ - --port $LLAMA_STACK_PORT \ - --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ - --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ - --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN \ - --env AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION -``` - -### Via Conda - -```bash -llama stack build --template bedrock --image-type conda -llama stack run ./run.yaml \ - --port $LLAMA_STACK_PORT \ - --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ - --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ - --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN \ - --env AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION -``` diff --git a/docs/source/distributions/self_hosted_distro/cerebras.md b/docs/source/distributions/self_hosted_distro/cerebras.md deleted file mode 100644 index 3c4db1b75..000000000 --- a/docs/source/distributions/self_hosted_distro/cerebras.md +++ /dev/null @@ -1,67 +0,0 @@ - -# Cerebras Distribution - -The `llamastack/distribution-cerebras` distribution consists of the following provider configurations. 
- -| API | Provider(s) | -|-----|-------------| -| agents | `inline::meta-reference` | -| datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | -| inference | `remote::cerebras`, `inline::sentence-transformers` | -| safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | -| telemetry | `inline::meta-reference` | -| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` | -| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | - - -### Environment Variables - -The following environment variables can be configured: - -- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`) -- `CEREBRAS_API_KEY`: Cerebras API Key (default: ``) - -### Models - -The following models are available by default: - -- `llama3.1-8b (aliases: meta-llama/Llama-3.1-8B-Instruct)` -- `llama-3.3-70b (aliases: meta-llama/Llama-3.3-70B-Instruct)` - - -### Prerequisite: API Keys - -Make sure you have access to a Cerebras API Key. You can get one by visiting [cloud.cerebras.ai](https://cloud.cerebras.ai/). - - -## Running Llama Stack with Cerebras - -You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. - -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ - llamastack/distribution-cerebras \ - --config /root/my-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY -``` - -### Via Conda - -```bash -llama stack build --template cerebras --image-type conda -llama stack run ./run.yaml \ - --port 8321 \ - --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY -``` diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md deleted file mode 100644 index e09666e13..000000000 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ /dev/null @@ -1,86 +0,0 @@ ---- -orphan: true ---- - -# Fireworks Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-fireworks` distribution consists of the following provider configurations. 
- -| API | Provider(s) | -|-----|-------------| -| agents | `inline::meta-reference` | -| datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | -| files | `inline::localfs` | -| inference | `remote::fireworks`, `inline::sentence-transformers` | -| safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | -| telemetry | `inline::meta-reference` | -| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::rag-runtime`, `remote::model-context-protocol` | -| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | - - -### Environment Variables - -The following environment variables can be configured: - -- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`) -- `FIREWORKS_API_KEY`: Fireworks.AI API Key (default: ``) - -### Models - -The following models are available by default: - -- `accounts/fireworks/models/llama-v3p1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)` -- `accounts/fireworks/models/llama-v3p1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)` -- `accounts/fireworks/models/llama-v3p1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)` -- `accounts/fireworks/models/llama-v3p2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)` -- `accounts/fireworks/models/llama-v3p2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)` -- `accounts/fireworks/models/llama-v3p2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)` -- `accounts/fireworks/models/llama-v3p3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)` -- `accounts/fireworks/models/llama-guard-3-8b (aliases: meta-llama/Llama-Guard-3-8B)` -- `accounts/fireworks/models/llama-guard-3-11b-vision (aliases: meta-llama/Llama-Guard-3-11B-Vision)` -- `accounts/fireworks/models/llama4-scout-instruct-basic (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)` -- `accounts/fireworks/models/llama4-maverick-instruct-basic (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)` -- `nomic-ai/nomic-embed-text-v1.5 ` - - -### Prerequisite: API Keys - -Make sure you have access to a Fireworks API Key. You can get one by visiting [fireworks.ai](https://fireworks.ai/). - - -## Running Llama Stack with Fireworks - -You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. - -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - llamastack/distribution-fireworks \ - --port $LLAMA_STACK_PORT \ - --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY -``` - -### Via Conda - -```bash -llama stack build --template fireworks --image-type conda -llama stack run ./run.yaml \ - --port $LLAMA_STACK_PORT \ - --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY -``` diff --git a/docs/source/distributions/self_hosted_distro/groq.md b/docs/source/distributions/self_hosted_distro/groq.md deleted file mode 100644 index 1b2194ad8..000000000 --- a/docs/source/distributions/self_hosted_distro/groq.md +++ /dev/null @@ -1,82 +0,0 @@ ---- -orphan: true ---- - -# Groq Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-groq` distribution consists of the following provider configurations. 
- -| API | Provider(s) | -|-----|-------------| -| agents | `inline::meta-reference` | -| datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | -| inference | `remote::groq` | -| safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | -| telemetry | `inline::meta-reference` | -| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime` | -| vector_io | `inline::faiss` | - - -### Environment Variables - -The following environment variables can be configured: - -- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `8321`) -- `GROQ_API_KEY`: Groq API Key (default: ``) - -### Models - -The following models are available by default: - -- `groq/llama3-8b-8192 (aliases: meta-llama/Llama-3.1-8B-Instruct)` -- `groq/llama-3.1-8b-instant ` -- `groq/llama3-70b-8192 (aliases: meta-llama/Llama-3-70B-Instruct)` -- `groq/llama-3.3-70b-versatile (aliases: meta-llama/Llama-3.3-70B-Instruct)` -- `groq/llama-3.2-3b-preview (aliases: meta-llama/Llama-3.2-3B-Instruct)` -- `groq/llama-4-scout-17b-16e-instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)` -- `groq/meta-llama/llama-4-scout-17b-16e-instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)` -- `groq/llama-4-maverick-17b-128e-instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)` -- `groq/meta-llama/llama-4-maverick-17b-128e-instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)` - - -### Prerequisite: API Keys - -Make sure you have access to a Groq API Key. You can get one by visiting [Groq](https://api.groq.com/). - - -## Running Llama Stack with Groq - -You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. - -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - llamastack/distribution-groq \ - --port $LLAMA_STACK_PORT \ - --env GROQ_API_KEY=$GROQ_API_KEY -``` - -### Via Conda - -```bash -llama stack build --template groq --image-type conda -llama stack run ./run.yaml \ - --port $LLAMA_STACK_PORT \ - --env GROQ_API_KEY=$GROQ_API_KEY -``` diff --git a/docs/source/distributions/self_hosted_distro/nvidia.md b/docs/source/distributions/self_hosted_distro/nvidia.md deleted file mode 100644 index 47e38f73d..000000000 --- a/docs/source/distributions/self_hosted_distro/nvidia.md +++ /dev/null @@ -1,177 +0,0 @@ - -# NVIDIA Distribution - -The `llamastack/distribution-nvidia` distribution consists of the following provider configurations. 
- -| API | Provider(s) | -|-----|-------------| -| agents | `inline::meta-reference` | -| datasetio | `inline::localfs`, `remote::nvidia` | -| eval | `remote::nvidia` | -| inference | `remote::nvidia` | -| post_training | `remote::nvidia` | -| safety | `remote::nvidia` | -| scoring | `inline::basic` | -| telemetry | `inline::meta-reference` | -| tool_runtime | `inline::rag-runtime` | -| vector_io | `inline::faiss` | - - -### Environment Variables - -The following environment variables can be configured: - -- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``) -- `NVIDIA_APPEND_API_VERSION`: Whether to append the API version to the base_url (default: `True`) -- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`) -- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`) -- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`) -- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`) -- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`) -- `NVIDIA_GUARDRAILS_CONFIG_ID`: NVIDIA Guardrail Configuration ID (default: `self-check`) -- `NVIDIA_EVALUATOR_URL`: URL for the NeMo Evaluator Service (default: `http://0.0.0.0:7331`) -- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`) -- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`) - -### Models - -The following models are available by default: - -- `meta/llama3-8b-instruct (aliases: meta-llama/Llama-3-8B-Instruct)` -- `meta/llama3-70b-instruct (aliases: meta-llama/Llama-3-70B-Instruct)` -- `meta/llama-3.1-8b-instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)` -- `meta/llama-3.1-70b-instruct (aliases: meta-llama/Llama-3.1-70B-Instruct)` -- `meta/llama-3.1-405b-instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)` -- `meta/llama-3.2-1b-instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)` -- `meta/llama-3.2-3b-instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)` -- `meta/llama-3.2-11b-vision-instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)` -- `meta/llama-3.2-90b-vision-instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)` -- `meta/llama-3.3-70b-instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)` -- `nvidia/llama-3.2-nv-embedqa-1b-v2 ` -- `nvidia/nv-embedqa-e5-v5 ` -- `nvidia/nv-embedqa-mistral-7b-v2 ` -- `snowflake/arctic-embed-l ` - - -## Prerequisites -### NVIDIA API Keys - -Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable. - -### Deploy NeMo Microservices Platform -The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform. - -## Supported Services -Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints. - -### Inference: NVIDIA NIM -NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs: - 1. 
Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key) - 2. Self-hosted: NVIDIA NIMs that run on your own infrastructure. - -The deployed platform includes the NIM Proxy microservice, which is the service that provides to access your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment. - -### Datasetio API: NeMo Data Store -The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposts APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint. - -See the {repopath}`NVIDIA Datasetio docs::llama_stack/providers/remote/datasetio/nvidia/README.md` for supported features and example usage. - -### Eval API: NeMo Evaluator -The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint. - -See the {repopath}`NVIDIA Eval docs::llama_stack/providers/remote/eval/nvidia/README.md` for supported features and example usage. - -### Post-Training API: NeMo Customizer -The NeMo Customizer microservice supports fine-tuning models. You can reference {repopath}`this list of supported models::llama_stack/providers/remote/post_training/nvidia/models.py` that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint. - -See the {repopath}`NVIDIA Post-Training docs::llama_stack/providers/remote/post_training/nvidia/README.md` for supported features and example usage. - -### Safety API: NeMo Guardrails -The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint. - -See the {repopath}`NVIDIA Safety docs::llama_stack/providers/remote/safety/nvidia/README.md` for supported features and example usage. - -## Deploying models -In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`. - -Note: For improved inference speeds, we need to use NIM with `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart. -```sh -# URL to NeMo NIM Proxy service -export NEMO_URL="http://nemo.test" - -curl --location "$NEMO_URL/v1/deployment/model-deployments" \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "name": "llama-3.2-1b-instruct", - "namespace": "meta", - "config": { - "model": "meta/llama-3.2-1b-instruct", - "nim_deployment": { - "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct", - "image_tag": "1.8.3", - "pvc_size": "25Gi", - "gpu": 1, - "additional_envs": { - "NIM_GUIDED_DECODING_BACKEND": "fast_outlines" - } - } - } - }' -``` -This NIM deployment should take approximately 10 minutes to go live. 
[See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference. - -You can also remove a deployed NIM to free up GPU resources, if needed. -```sh -export NEMO_URL="http://nemo.test" - -curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct" -``` - -## Running Llama Stack with NVIDIA - -You can do this via Conda or venv (build code), or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. - -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ - llamastack/distribution-nvidia \ - --config /root/my-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env NVIDIA_API_KEY=$NVIDIA_API_KEY -``` - -### Via Conda - -```bash -INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct -llama stack build --template nvidia --image-type conda -llama stack run ./run.yaml \ - --port 8321 \ - --env NVIDIA_API_KEY=$NVIDIA_API_KEY \ - --env INFERENCE_MODEL=$INFERENCE_MODEL -``` - -### Via venv - -If you've set up your local development environment, you can also build the image using your local virtual environment. - -```bash -INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct -llama stack build --template nvidia --image-type venv -llama stack run ./run.yaml \ - --port 8321 \ - --env NVIDIA_API_KEY=$NVIDIA_API_KEY \ - --env INFERENCE_MODEL=$INFERENCE_MODEL -``` - -## Example Notebooks -For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in {repopath}`docs/notebooks/nvidia`. diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md deleted file mode 100644 index e09c79359..000000000 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ /dev/null @@ -1,165 +0,0 @@ ---- -orphan: true ---- - -# Ollama Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-ollama` distribution consists of the following provider configurations. - -| API | Provider(s) | -|-----|-------------| -| agents | `inline::meta-reference` | -| datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | -| files | `inline::localfs` | -| inference | `remote::ollama` | -| post_training | `inline::huggingface` | -| safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | -| telemetry | `inline::meta-reference` | -| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` | -| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | - - -You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration. 
- -### Environment Variables - -The following environment variables can be configured: - -- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`) -- `OLLAMA_URL`: URL of the Ollama server (default: `http://127.0.0.1:11434`) -- `INFERENCE_MODEL`: Inference model loaded into the Ollama server (default: `meta-llama/Llama-3.2-3B-Instruct`) -- `SAFETY_MODEL`: Safety model loaded into the Ollama server (default: `meta-llama/Llama-Guard-3-1B`) - - -## Setting up Ollama server - -Please check the [Ollama Documentation](https://github.com/ollama/ollama) on how to install and run Ollama. After installing Ollama, you need to run `ollama serve` to start the server. - -In order to load models, you can run: - -```bash -export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" - -# ollama names this model differently, and we must use the ollama name when loading the model -export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16" -ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m -``` - -If you are using Llama Stack Safety / Shield APIs, you will also need to pull and run the safety model. - -```bash -export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B" - -# ollama names this model differently, and we must use the ollama name when loading the model -export OLLAMA_SAFETY_MODEL="llama-guard3:1b" -ollama run $OLLAMA_SAFETY_MODEL --keepalive 60m -``` - -## Running Llama Stack - -Now you are ready to run Llama Stack with Ollama as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. - -```bash -export LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ~/.llama:/root/.llama \ - llamastack/distribution-ollama \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env OLLAMA_URL=http://host.docker.internal:11434 -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -# You need a local checkout of llama-stack to run this, get it using -# git clone https://github.com/meta-llama/llama-stack.git -cd /path/to/llama-stack - -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ~/.llama:/root/.llama \ - -v ./llama_stack/templates/ollama/run-with-safety.yaml:/root/my-run.yaml \ - llamastack/distribution-ollama \ - --config /root/my-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env OLLAMA_URL=http://host.docker.internal:11434 -``` - -### Via Conda - -Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available. - -```bash -export LLAMA_STACK_PORT=8321 - -llama stack build --template ollama --image-type conda -llama stack run ./run.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env OLLAMA_URL=http://localhost:11434 -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -llama stack run ./run-with-safety.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env OLLAMA_URL=http://localhost:11434 -``` - - -### (Optional) Update Model Serving Configuration - -```{note} -Please check the [model_entries](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/models.py) for the supported Ollama models. 
-``` - -To serve a new model with `ollama` -```bash -ollama run -``` - -To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama. -``` -$ ollama ps -NAME ID SIZE PROCESSOR UNTIL -llama3.2:3b-instruct-fp16 195a8c01d91e 8.6 GB 100% GPU 9 minutes from now -``` - -To verify that the model served by ollama is correctly connected to Llama Stack server -```bash -$ llama-stack-client models list - -Available Models - -┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━┓ -┃ model_type ┃ identifier ┃ provider_resource_id ┃ metadata ┃ provider_id ┃ -┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━┩ -│ llm │ meta-llama/Llama-3.2-3B-Instruct │ llama3.2:3b-instruct-fp16 │ │ ollama │ -└──────────────┴──────────────────────────────────────┴──────────────────────────────┴───────────┴─────────────┘ - -Total models: 1 -``` diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md deleted file mode 100644 index 6e7cf410d..000000000 --- a/docs/source/distributions/self_hosted_distro/remote-vllm.md +++ /dev/null @@ -1,297 +0,0 @@ ---- -orphan: true ---- - -# Remote vLLM Distribution -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-remote-vllm` distribution consists of the following provider configurations: - -| API | Provider(s) | -|-----|-------------| -| agents | `inline::meta-reference` | -| datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | -| inference | `remote::vllm`, `inline::sentence-transformers` | -| safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | -| telemetry | `inline::meta-reference` | -| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` | -| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | - - -You can use this distribution if you want to run an independent vLLM server for inference. - -### Environment Variables - -The following environment variables can be configured: - -- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`) -- `INFERENCE_MODEL`: Inference model loaded into the vLLM server (default: `meta-llama/Llama-3.2-3B-Instruct`) -- `VLLM_URL`: URL of the vLLM server with the main inference model (default: `http://host.docker.internal:5100/v1`) -- `MAX_TOKENS`: Maximum number of tokens for generation (default: `4096`) -- `SAFETY_VLLM_URL`: URL of the vLLM server with the safety model (default: `http://host.docker.internal:5101/v1`) -- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`) - - -## Setting up vLLM server - -In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM -server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also -[supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and -that we only use GPUs here for demonstration purposes. 
Note that if you run into issues, you can include the environment variable `--env VLLM_DEBUG_LOG_API_SERVER_RESPONSE=true` (available in vLLM v0.8.3 and above) in the `docker run` command to enable log response from API server for debugging. - -### Setting up vLLM server on AMD GPU - -AMD provides two main vLLM container options: -- rocm/vllm: Production-ready container -- rocm/vllm-dev: Development container with the latest vLLM features - -Please check the [Blog about ROCm vLLM Usage](https://rocm.blogs.amd.com/software-tools-optimization/vllm-container/README.html) to get more details. - -Here is a sample script to start a ROCm vLLM server locally via Docker: - -```bash -export INFERENCE_PORT=8000 -export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct -export CUDA_VISIBLE_DEVICES=0 -export VLLM_DIMG="rocm/vllm-dev:main" - -docker run \ - --pull always \ - --ipc=host \ - --privileged \ - --shm-size 16g \ - --device=/dev/kfd \ - --device=/dev/dri \ - --group-add video \ - --cap-add=SYS_PTRACE \ - --cap-add=CAP_SYS_ADMIN \ - --security-opt seccomp=unconfined \ - --security-opt apparmor=unconfined \ - --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ - --env "HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \ - -p $INFERENCE_PORT:$INFERENCE_PORT \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - $VLLM_DIMG \ - python -m vllm.entrypoints.openai.api_server \ - --model $INFERENCE_MODEL \ - --port $INFERENCE_PORT -``` - -Note that you'll also need to set `--enable-auto-tool-choice` and `--tool-call-parser` to [enable tool calling in vLLM](https://docs.vllm.ai/en/latest/features/tool_calling.html). - -If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: - -```bash -export SAFETY_PORT=8081 -export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B -export CUDA_VISIBLE_DEVICES=1 -export VLLM_DIMG="rocm/vllm-dev:main" - -docker run \ - --pull always \ - --ipc=host \ - --privileged \ - --shm-size 16g \ - --device=/dev/kfd \ - --device=/dev/dri \ - --group-add video \ - --cap-add=SYS_PTRACE \ - --cap-add=CAP_SYS_ADMIN \ - --security-opt seccomp=unconfined \ - --security-opt apparmor=unconfined \ - --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ - --env "HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \ - -p $SAFETY_PORT:$SAFETY_PORT \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - $VLLM_DIMG \ - python -m vllm.entrypoints.openai.api_server \ - --model $SAFETY_MODEL \ - --port $SAFETY_PORT -``` - -### Setting up vLLM server on NVIDIA GPU - -Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) to get a vLLM endpoint. Here is a sample script to start a vLLM server locally via Docker: - -```bash -export INFERENCE_PORT=8000 -export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct -export CUDA_VISIBLE_DEVICES=0 - -docker run \ - --pull always \ - --runtime nvidia \ - --gpus $CUDA_VISIBLE_DEVICES \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ - -p $INFERENCE_PORT:$INFERENCE_PORT \ - --ipc=host \ - vllm/vllm-openai:latest \ - --gpu-memory-utilization 0.7 \ - --model $INFERENCE_MODEL \ - --port $INFERENCE_PORT -``` - -Note that you'll also need to set `--enable-auto-tool-choice` and `--tool-call-parser` to [enable tool calling in vLLM](https://docs.vllm.ai/en/latest/features/tool_calling.html). 
- -If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: - -```bash -export SAFETY_PORT=8081 -export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B -export CUDA_VISIBLE_DEVICES=1 - -docker run \ - --pull always \ - --runtime nvidia \ - --gpus $CUDA_VISIBLE_DEVICES \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ - -p $SAFETY_PORT:$SAFETY_PORT \ - --ipc=host \ - vllm/vllm-openai:latest \ - --gpu-memory-utilization 0.7 \ - --model $SAFETY_MODEL \ - --port $SAFETY_PORT -``` - -### Setting up vLLM server on Intel GPU - -Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to vLLM side setup which guides towards installing vLLM from sources orself-building vLLM Docker container, Intel provides prebuilt vLLM container to use on systems with Intel GPUs supported by PyTorch XPU backend: -- [intel/vllm](https://hub.docker.com/r/intel/vllm) - -Here is a sample script to start a vLLM server locally via Docker using Intel provided container: - -```bash -export INFERENCE_PORT=8000 -export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct -export ZE_AFFINITY_MASK=0 - -docker run \ - --pull always \ - --device /dev/dri \ - -v /dev/dri/by-path:/dev/dri/by-path \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ - --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \ - -p $INFERENCE_PORT:$INFERENCE_PORT \ - --ipc=host \ - intel/vllm:xpu \ - --gpu-memory-utilization 0.7 \ - --model $INFERENCE_MODEL \ - --port $INFERENCE_PORT -``` - -If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: - -```bash -export SAFETY_PORT=8081 -export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B -export ZE_AFFINITY_MASK=1 - -docker run \ - --pull always \ - --device /dev/dri \ - -v /dev/dri/by-path:/dev/dri/by-path \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ - --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \ - -p $SAFETY_PORT:$SAFETY_PORT \ - --ipc=host \ - intel/vllm:xpu \ - --gpu-memory-utilization 0.7 \ - --model $SAFETY_MODEL \ - --port $SAFETY_PORT -``` - -## Running Llama Stack - -Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. 
- -```bash -export INFERENCE_PORT=8000 -export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct -export LLAMA_STACK_PORT=8321 - -# You need a local checkout of llama-stack to run this, get it using -# git clone https://github.com/meta-llama/llama-stack.git -cd /path/to/llama-stack - -docker run \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \ - llamastack/distribution-remote-vllm \ - --config /root/my-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1 -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -export SAFETY_PORT=8081 -export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B - -# You need a local checkout of llama-stack to run this, get it using -# git clone https://github.com/meta-llama/llama-stack.git -cd /path/to/llama-stack - -docker run \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ~/.llama:/root/.llama \ - -v ./llama_stack/templates/remote-vllm/run-with-safety.yaml:/root/my-run.yaml \ - llamastack/distribution-remote-vllm \ - --config /root/my-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1 \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env SAFETY_VLLM_URL=http://host.docker.internal:$SAFETY_PORT/v1 -``` - - -### Via Conda - -Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available. - -```bash -export INFERENCE_PORT=8000 -export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct -export LLAMA_STACK_PORT=8321 - -cd distributions/remote-vllm -llama stack build --template remote-vllm --image-type conda - -llama stack run ./run.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env VLLM_URL=http://localhost:$INFERENCE_PORT/v1 -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -export SAFETY_PORT=8081 -export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B - -llama stack run ./run-with-safety.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env VLLM_URL=http://localhost:$INFERENCE_PORT/v1 \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env SAFETY_VLLM_URL=http://localhost:$SAFETY_PORT/v1 -``` diff --git a/docs/source/distributions/self_hosted_distro/sambanova.md b/docs/source/distributions/self_hosted_distro/sambanova.md deleted file mode 100644 index bb4842362..000000000 --- a/docs/source/distributions/self_hosted_distro/sambanova.md +++ /dev/null @@ -1,91 +0,0 @@ ---- -orphan: true ---- - -# SambaNova Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-sambanova` distribution consists of the following provider configurations. 
- -| API | Provider(s) | -|-----|-------------| -| agents | `inline::meta-reference` | -| inference | `remote::sambanova`, `inline::sentence-transformers` | -| safety | `remote::sambanova` | -| telemetry | `inline::meta-reference` | -| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` | -| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | - - -### Environment Variables - -The following environment variables can be configured: - -- `LLAMASTACK_PORT`: Port for the Llama Stack distribution server (default: `8321`) -- `SAMBANOVA_API_KEY`: SambaNova API Key (default: ``) - -### Models - -The following models are available by default: - -- `sambanova/Meta-Llama-3.1-8B-Instruct (aliases: meta-llama/Llama-3.1-8B-Instruct)` -- `sambanova/Meta-Llama-3.1-405B-Instruct (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)` -- `sambanova/Meta-Llama-3.2-1B-Instruct (aliases: meta-llama/Llama-3.2-1B-Instruct)` -- `sambanova/Meta-Llama-3.2-3B-Instruct (aliases: meta-llama/Llama-3.2-3B-Instruct)` -- `sambanova/Meta-Llama-3.3-70B-Instruct (aliases: meta-llama/Llama-3.3-70B-Instruct)` -- `sambanova/Llama-3.2-11B-Vision-Instruct (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)` -- `sambanova/Llama-3.2-90B-Vision-Instruct (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)` -- `sambanova/Llama-4-Scout-17B-16E-Instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct)` -- `sambanova/Llama-4-Maverick-17B-128E-Instruct (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct)` -- `sambanova/Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)` - - -### Prerequisite: API Keys - -Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](http://cloud.sambanova.ai?utm_source=llamastack&utm_medium=external&utm_campaign=cloud_signup). - - -## Running Llama Stack with SambaNova - -You can do this via Conda (build code) or Docker which has a pre-built image. - - -### Via Docker - -```bash -LLAMA_STACK_PORT=8321 -llama stack build --template sambanova --image-type container -docker run \ - -it \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ~/.llama:/root/.llama \ - distribution-sambanova \ - --port $LLAMA_STACK_PORT \ - --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY -``` - - -### Via Venv - -```bash -llama stack build --template sambanova --image-type venv -llama stack run --image-type venv ~/.llama/distributions/sambanova/sambanova-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY -``` - - -### Via Conda - -```bash -llama stack build --template sambanova --image-type conda -llama stack run --image-type conda ~/.llama/distributions/sambanova/sambanova-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY -``` diff --git a/docs/source/distributions/self_hosted_distro/starter.md b/docs/source/distributions/self_hosted_distro/starter.md new file mode 100644 index 000000000..730ccf165 --- /dev/null +++ b/docs/source/distributions/self_hosted_distro/starter.md @@ -0,0 +1,134 @@ +--- +orphan: true +--- + +# Starter Distribution + +```{toctree} +:maxdepth: 2 +:hidden: + +self +``` + +The `llamastack/distribution-starter` distribution is a comprehensive, multi-provider distribution that includes most of the available inference providers in Llama Stack. It's designed to be a one-stop solution for developers who want to experiment with different AI providers without having to configure each one individually. 
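To make this concrete before diving into the provider list, here is a rough sketch of what "enable only what you need" looks like in practice, using the `ENABLE_<PROVIDER>` convention and the `llama stack run` invocation described in the sections below. The Ollama variables are the ones used elsewhere in this guide; `ENABLE_OPENAI` is assumed to follow the same naming convention and should be verified against the generated `run.yaml`.

```bash
# Enable a local Ollama provider (variable names as used elsewhere in this guide).
export ENABLE_OLLAMA=ollama
export OLLAMA_URL=http://localhost:11434
export OLLAMA_INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct

# Optionally enable a hosted provider alongside it. ENABLE_OPENAI is assumed to
# follow the same ENABLE_<PROVIDER> pattern; check the exact name in run.yaml.
export ENABLE_OPENAI=openai
export OPENAI_API_KEY=sk-...

# Start the stack with the enabled providers (see "Running the Distribution" below).
llama stack run distributions/starter/run.yaml --port 8321
```

Everything else stays disabled, so the same distribution can be pointed at different backends purely through environment variables.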
+
+## Provider Composition
+
+The starter distribution consists of the following configurations:
+
+| API | Provider(s) |
+|-----|-------------|
+| agents | `inline::meta-reference` |
+| datasetio | `remote::huggingface`, `inline::localfs` |
+| eval | `inline::meta-reference` |
+| files | `inline::localfs` |
+| inference | `remote::openai`, `remote::fireworks`, `remote::together`, `remote::ollama`, `remote::anthropic`, `remote::gemini`, `remote::groq`, `remote::sambanova`, `remote::vllm`, `remote::tgi`, `remote::cerebras`, `remote::llama-openai-compat`, `remote::nvidia`, `remote::hf::serverless`, `remote::hf::endpoint`, `inline::sentence-transformers`, `remote::passthrough` |
+| safety | `inline::llama-guard` |
+| post_training | `inline::huggingface` |
+| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
+| telemetry | `inline::meta-reference` |
+| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` |
+| vector_io | `inline::faiss`, `inline::sqlite-vec`, `remote::chromadb`, `remote::pgvector` |
+
+## Inference Providers
+
+The starter distribution includes a comprehensive set of inference providers:
+
+- **OpenAI**: GPT-4, GPT-3.5, O1, O3, O4 models and text embeddings - see the relevant provider configuration documentation for details
+- **Fireworks**: Llama 3.1, 3.2, 3.3, 4 Scout, 4 Maverick models and embeddings
+- **Together**: Llama 3.1, 3.2, 3.3, 4 Scout, 4 Maverick models and embeddings
+- **Anthropic**: Claude 3.5 Sonnet, Claude 3.7 Sonnet, Claude 3.5 Haiku, and Voyage embeddings
+- **Gemini**: Gemini 1.5, 2.0, 2.5 models and text embeddings
+- **Groq**: Fast Llama models (3.1, 3.2, 3.3, 4 Scout, 4 Maverick)
+- **SambaNova**: Llama 3.1, 3.2, 3.3, 4 Scout, 4 Maverick models
+- **Cerebras**: Cerebras AI models
+- **NVIDIA**: NVIDIA NIM models
+- **HuggingFace**: Serverless and endpoint models
+- **Bedrock**: AWS Bedrock models
+- **Passthrough**: Passthrough provider - use this to connect to any other inference provider that is not supported by Llama Stack
+- **Ollama**: Local Ollama models
+- **vLLM**: Remote vLLM server
+- **TGI**: Text Generation Inference server, including Dell Enterprise Hub's custom TGI container (use `DEH_URL`)
+- **Sentence Transformers**: Local embedding models
+
+All providers are **disabled** by default, so you need to enable them by setting environment variables. See [Enabling Providers](#enabling-providers) for more details.
+
+## Vector Providers
+
+The starter distribution includes a comprehensive set of vector providers:
+
+- **FAISS**: Local FAISS vector store - enabled by default
+- **SQLite**: Local SQLite vector store - disabled by default
+- **ChromaDB**: Remote ChromaDB server - disabled by default
+- **PGVector**: Remote PGVector server - disabled by default
+
+## Enabling Providers
+
+You can enable specific providers by setting the corresponding environment variable to a provider ID string.
+
+For instance, to enable the Ollama provider, you can set the `ENABLE_OLLAMA` environment variable to `ollama`.
+
+```bash
+export ENABLE_OLLAMA=ollama
+```
+
+To disable a provider, set its environment variable to `__disabled__`, for example `ENABLE_OLLAMA=__disabled__`.
+
+## Running the Distribution
+
+You can run the starter distribution via Docker or directly using the Llama Stack CLI.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+ +```bash +LLAMA_STACK_PORT=8321 +docker run \ + -it \ + --pull always \ + -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ + -e ENABLE_OLLAMA=ollama \ + -e OLLAMA_INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \ + llamastack/distribution-starter \ + --port $LLAMA_STACK_PORT +``` + +You can also use the `llama stack run` command to run the distribution. + +```bash +llama stack run distributions/starter/run.yaml \ + --port 8321 \ + --env ENABLE_OLLAMA=ollama \ + --env OLLAMA_INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct +``` + +## Storage + +The starter distribution uses SQLite for local storage of various components: + +- **Metadata store**: `~/.llama/distributions/starter/registry.db` +- **Inference store**: `~/.llama/distributions/starter/inference_store.db` +- **FAISS store**: `~/.llama/distributions/starter/faiss_store.db` +- **SQLite vector store**: `~/.llama/distributions/starter/sqlite_vec.db` +- **Files metadata**: `~/.llama/distributions/starter/files_metadata.db` +- **Agents store**: `~/.llama/distributions/starter/agents_store.db` +- **Responses store**: `~/.llama/distributions/starter/responses_store.db` +- **Trace store**: `~/.llama/distributions/starter/trace_store.db` +- **Evaluation store**: `~/.llama/distributions/starter/meta_reference_eval.db` +- **Dataset I/O stores**: Various HuggingFace and local filesystem stores + +## Benefits of the Starter Distribution + +1. **Comprehensive Coverage**: Includes most popular AI providers in one distribution +2. **Flexible Configuration**: Easy to enable/disable providers based on your needs +3. **No Local GPU Required**: Most providers are cloud-based, making it accessible to developers without high-end hardware +4. **Easy Migration**: Start with hosted providers and gradually move to local ones as needed +5. **Production Ready**: Includes safety, evaluation, and telemetry components +6. **Tool Integration**: Comes with web search, RAG, and model context protocol tools + +The starter distribution is ideal for developers who want to experiment with different AI providers, build prototypes quickly, or create applications that can work with multiple AI backends. diff --git a/docs/source/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md deleted file mode 100644 index 24f9d03ec..000000000 --- a/docs/source/distributions/self_hosted_distro/tgi.md +++ /dev/null @@ -1,149 +0,0 @@ ---- -orphan: true ---- - - -# TGI Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-tgi` distribution consists of the following provider configurations. - -| API | Provider(s) | -|-----|-------------| -| agents | `inline::meta-reference` | -| datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | -| inference | `remote::tgi`, `inline::sentence-transformers` | -| safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | -| telemetry | `inline::meta-reference` | -| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol` | -| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | - - -You can use this distribution if you have GPUs and want to run an independent TGI server container for running inference. 
- -### Environment Variables - -The following environment variables can be configured: - -- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`) -- `INFERENCE_MODEL`: Inference model loaded into the TGI server (default: `meta-llama/Llama-3.2-3B-Instruct`) -- `TGI_URL`: URL of the TGI server with the main inference model (default: `http://127.0.0.1:8080/v1`) -- `TGI_SAFETY_URL`: URL of the TGI server with the safety model (default: `http://127.0.0.1:8081/v1`) -- `SAFETY_MODEL`: Name of the safety (Llama-Guard) model to use (default: `meta-llama/Llama-Guard-3-1B`) - - -## Setting up TGI server - -Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. Here is a sample script to start a TGI server locally via Docker: - -```bash -export INFERENCE_PORT=8080 -export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct -export CUDA_VISIBLE_DEVICES=0 - -docker run --rm -it \ - --pull always \ - -v $HOME/.cache/huggingface:/data \ - -p $INFERENCE_PORT:$INFERENCE_PORT \ - --gpus $CUDA_VISIBLE_DEVICES \ - ghcr.io/huggingface/text-generation-inference:2.3.1 \ - --dtype bfloat16 \ - --usage-stats off \ - --sharded false \ - --cuda-memory-fraction 0.7 \ - --model-id $INFERENCE_MODEL \ - --port $INFERENCE_PORT -``` - -If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: - -```bash -export SAFETY_PORT=8081 -export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B -export CUDA_VISIBLE_DEVICES=1 - -docker run --rm -it \ - --pull always \ - -v $HOME/.cache/huggingface:/data \ - -p $SAFETY_PORT:$SAFETY_PORT \ - --gpus $CUDA_VISIBLE_DEVICES \ - ghcr.io/huggingface/text-generation-inference:2.3.1 \ - --dtype bfloat16 \ - --usage-stats off \ - --sharded false \ - --model-id $SAFETY_MODEL \ - --port $SAFETY_PORT -``` - -## Running Llama Stack - -Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. - -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - llamastack/distribution-tgi \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -# You need a local checkout of llama-stack to run this, get it using -# git clone https://github.com/meta-llama/llama-stack.git -cd /path/to/llama-stack - -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ~/.llama:/root/.llama \ - -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \ - llamastack/distribution-tgi \ - --config /root/my-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env TGI_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT -``` - -### Via Conda - -Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available. 
- -```bash -llama stack build --template tgi --image-type conda -llama stack run ./run.yaml - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -llama stack run ./run-with-safety.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT -``` diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md deleted file mode 100644 index adfc2c472..000000000 --- a/docs/source/distributions/self_hosted_distro/together.md +++ /dev/null @@ -1,86 +0,0 @@ ---- -orphan: true ---- - -# Together Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-together` distribution consists of the following provider configurations. - -| API | Provider(s) | -|-----|-------------| -| agents | `inline::meta-reference` | -| datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | -| inference | `remote::together`, `inline::sentence-transformers` | -| safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | -| telemetry | `inline::meta-reference` | -| tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` | -| vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | - - -### Environment Variables - -The following environment variables can be configured: - -- `LLAMA_STACK_PORT`: Port for the Llama Stack distribution server (default: `8321`) -- `TOGETHER_API_KEY`: Together.AI API Key (default: ``) - -### Models - -The following models are available by default: - -- `meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo (aliases: meta-llama/Llama-3.1-8B-Instruct)` -- `meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo (aliases: meta-llama/Llama-3.1-70B-Instruct)` -- `meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo (aliases: meta-llama/Llama-3.1-405B-Instruct-FP8)` -- `meta-llama/Llama-3.2-3B-Instruct-Turbo (aliases: meta-llama/Llama-3.2-3B-Instruct)` -- `meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo (aliases: meta-llama/Llama-3.2-11B-Vision-Instruct)` -- `meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo (aliases: meta-llama/Llama-3.2-90B-Vision-Instruct)` -- `meta-llama/Llama-3.3-70B-Instruct-Turbo (aliases: meta-llama/Llama-3.3-70B-Instruct)` -- `meta-llama/Meta-Llama-Guard-3-8B (aliases: meta-llama/Llama-Guard-3-8B)` -- `meta-llama/Llama-Guard-3-11B-Vision-Turbo (aliases: meta-llama/Llama-Guard-3-11B-Vision)` -- `togethercomputer/m2-bert-80M-8k-retrieval ` -- `togethercomputer/m2-bert-80M-32k-retrieval ` -- `meta-llama/Llama-4-Scout-17B-16E-Instruct (aliases: meta-llama/Llama-4-Scout-17B-16E-Instruct, together/meta-llama/Llama-4-Scout-17B-16E-Instruct)` -- `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 (aliases: meta-llama/Llama-4-Maverick-17B-128E-Instruct, together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8)` - - -### Prerequisite: API Keys - -Make sure you have access to a Together API Key. You can get one by visiting [together.xyz](https://together.xyz/). - - -## Running Llama Stack with Together - -You can do this via Conda (build code) or Docker which has a pre-built image. 
- -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. - -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - llamastack/distribution-together \ - --port $LLAMA_STACK_PORT \ - --env TOGETHER_API_KEY=$TOGETHER_API_KEY -``` - -### Via Conda - -```bash -llama stack build --template together --image-type conda -llama stack run ./run.yaml \ - --port $LLAMA_STACK_PORT \ - --env TOGETHER_API_KEY=$TOGETHER_API_KEY -``` diff --git a/docs/source/getting_started/detailed_tutorial.md b/docs/source/getting_started/detailed_tutorial.md index e40a4903a..d80ec3554 100644 --- a/docs/source/getting_started/detailed_tutorial.md +++ b/docs/source/getting_started/detailed_tutorial.md @@ -58,7 +58,7 @@ which defines the providers and their settings. Now let's build and run the Llama Stack config for Ollama. ```bash -INFERENCE_MODEL=llama3.2:3b llama stack build --template ollama --image-type venv --run +INFERENCE_MODEL=llama3.2:3b llama stack build --template starter --image-type venv --run ``` ::: :::{tab-item} Using `conda` @@ -69,7 +69,7 @@ which defines the providers and their settings. Now let's build and run the Llama Stack config for Ollama. ```bash -INFERENCE_MODEL=llama3.2:3b llama stack build --template ollama --image-type conda --image-name llama3-3b-conda --run +INFERENCE_MODEL=llama3.2:3b llama stack build --template starter --image-type conda --image-name llama3-3b-conda --run ``` ::: :::{tab-item} Using a Container diff --git a/docs/source/getting_started/index.md b/docs/source/getting_started/index.md index ee7cdd4a9..418a30eb3 100644 --- a/docs/source/getting_started/index.md +++ b/docs/source/getting_started/index.md @@ -17,7 +17,7 @@ ollama run llama3.2:3b --keepalive 60m #### Step 2: Run the Llama Stack server We will use `uv` to run the Llama Stack server. ```bash -INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template ollama --image-type venv --run +INFERENCE_MODEL=llama3.2:3b uv run --with llama-stack llama stack build --template starter --image-type venv --run ``` #### Step 3: Run the demo Now open up a new terminal and copy the following script into a file named `demo_script.py`. diff --git a/docs/source/providers/post_training/huggingface.md b/docs/source/providers/post_training/huggingface.md index c342203a8..c7896aaf4 100644 --- a/docs/source/providers/post_training/huggingface.md +++ b/docs/source/providers/post_training/huggingface.md @@ -23,7 +23,7 @@ To use the HF SFTTrainer in your Llama Stack project, follow these steps: You can access the HuggingFace trainer via the `ollama` distribution: ```bash -llama stack build --template ollama --image-type venv +llama stack build --template starter --image-type venv llama stack run --image-type venv ~/.llama/distributions/ollama/ollama-run.yaml ``` diff --git a/docs/zero_to_hero_guide/README.md b/docs/zero_to_hero_guide/README.md index 96f9768de..19c171de2 100644 --- a/docs/zero_to_hero_guide/README.md +++ b/docs/zero_to_hero_guide/README.md @@ -83,7 +83,7 @@ If you're looking for more specific topics, we have a [Zero to Hero Guide](#next 1. 
**Build the Llama Stack**: Build the Llama Stack using the `ollama` template: ```bash - llama stack build --template ollama --image-type conda + llama stack build --template starter --image-type conda ``` **Expected Output:** ```bash diff --git a/llama_stack/distribution/providers.py b/llama_stack/distribution/providers.py index 1d9c1f4e9..7095ffd18 100644 --- a/llama_stack/distribution/providers.py +++ b/llama_stack/distribution/providers.py @@ -84,7 +84,13 @@ class ProviderImpl(Providers): Each API maps to a dictionary of provider IDs to their health responses. """ providers_health: dict[str, dict[str, HealthResponse]] = {} - timeout = 1.0 + + # The timeout has to be long enough to allow all the providers to be checked, especially in + # the case of the inference router health check since it checks all registered inference + # providers. + # The timeout must not be equal to the one set by health method for a given implementation, + # otherwise we will miss some providers. + timeout = 3.0 async def check_provider_health(impl: Any) -> tuple[str, HealthResponse] | None: # Skip special implementations (inspect/providers) that don't have provider specs diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index c86880669..3b82afd05 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -98,6 +98,10 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]): method = getattr(impls[api], register_method) for obj in objects: + # Do not register models on disabled providers + if hasattr(obj, "provider_id") and obj.provider_id is not None and obj.provider_id == "__disabled__": + logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled provider.") + continue # In complex templates, like our starter template, we may have dynamic model ids # given by environment variables. This allows those environment variables to have # a default value of __disabled__ to skip registration of the model if not set. @@ -106,6 +110,7 @@ async def register_resources(run_config: StackRunConfig, impls: dict[Api, Any]): and obj.provider_model_id is not None and "__disabled__" in obj.provider_model_id ): + logger.debug(f"Skipping {rsrc.capitalize()} registration for disabled model.") continue # we want to maintain the type information in arguments to method. 
# instead of method(**obj.model_dump()), which may convert a typed attr to a dict, diff --git a/llama_stack/providers/remote/inference/cerebras/config.py b/llama_stack/providers/remote/inference/cerebras/config.py index 81312ec76..5ad7376fc 100644 --- a/llama_stack/providers/remote/inference/cerebras/config.py +++ b/llama_stack/providers/remote/inference/cerebras/config.py @@ -26,8 +26,8 @@ class CerebrasImplConfig(BaseModel): ) @classmethod - def sample_run_config(cls, **kwargs) -> dict[str, Any]: + def sample_run_config(cls, api_key: str = "${env.CEREBRAS_API_KEY}", **kwargs) -> dict[str, Any]: return { "base_url": DEFAULT_BASE_URL, - "api_key": "${env.CEREBRAS_API_KEY}", + "api_key": api_key, } diff --git a/llama_stack/providers/remote/inference/llama_openai_compat/config.py b/llama_stack/providers/remote/inference/llama_openai_compat/config.py index 57bc7240d..339d241ca 100644 --- a/llama_stack/providers/remote/inference/llama_openai_compat/config.py +++ b/llama_stack/providers/remote/inference/llama_openai_compat/config.py @@ -31,7 +31,7 @@ class LlamaCompatConfig(BaseModel): ) @classmethod - def sample_run_config(cls, api_key: str = "${env.LLAMA_API_KEY}", **kwargs) -> dict[str, Any]: + def sample_run_config(cls, api_key: str = "${env.LLAMA_API_KEY:}", **kwargs) -> dict[str, Any]: return { "openai_compat_api_base": "https://api.llama.com/compat/v1/", "api_key": api_key, diff --git a/llama_stack/providers/remote/inference/nvidia/config.py b/llama_stack/providers/remote/inference/nvidia/config.py index 6369928bb..c7f6f4ba6 100644 --- a/llama_stack/providers/remote/inference/nvidia/config.py +++ b/llama_stack/providers/remote/inference/nvidia/config.py @@ -53,9 +53,15 @@ class NVIDIAConfig(BaseModel): ) @classmethod - def sample_run_config(cls, **kwargs) -> dict[str, Any]: + def sample_run_config( + cls, + url: str = "${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}", + api_key: str = "${env.NVIDIA_API_KEY:+}", + append_api_version: bool = "${env.NVIDIA_APPEND_API_VERSION:=True}", + **kwargs, + ) -> dict[str, Any]: return { - "url": "${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com}", - "api_key": "${env.NVIDIA_API_KEY:+}", - "append_api_version": "${env.NVIDIA_APPEND_API_VERSION:=True}", + "url": url, + "api_key": api_key, + "append_api_version": append_api_version, } diff --git a/llama_stack/providers/remote/inference/ollama/config.py b/llama_stack/providers/remote/inference/ollama/config.py index b2cc4d8a7..0145810a8 100644 --- a/llama_stack/providers/remote/inference/ollama/config.py +++ b/llama_stack/providers/remote/inference/ollama/config.py @@ -13,13 +13,9 @@ DEFAULT_OLLAMA_URL = "http://localhost:11434" class OllamaImplConfig(BaseModel): url: str = DEFAULT_OLLAMA_URL - raise_on_connect_error: bool = True @classmethod - def sample_run_config( - cls, url: str = "${env.OLLAMA_URL:=http://localhost:11434}", raise_on_connect_error: bool = True, **kwargs - ) -> dict[str, Any]: + def sample_run_config(cls, url: str = "${env.OLLAMA_URL:=http://localhost:11434}", **kwargs) -> dict[str, Any]: return { "url": url, - "raise_on_connect_error": raise_on_connect_error, } diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index e9df0dcc8..10d11a617 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -91,7 +91,6 @@ class OllamaInferenceAdapter( def __init__(self, config: OllamaImplConfig) -> None: self.register_helper = 
ModelRegistryHelper(MODEL_ENTRIES) self.url = config.url - self.raise_on_connect_error = config.raise_on_connect_error @property def client(self) -> AsyncClient: @@ -105,10 +104,7 @@ class OllamaInferenceAdapter( logger.debug(f"checking connectivity to Ollama at `{self.url}`...") health_response = await self.health() if health_response["status"] == HealthStatus.ERROR: - if self.raise_on_connect_error: - raise RuntimeError("Ollama Server is not running, start it using `ollama serve` in a separate terminal") - else: - logger.warning("Ollama Server is not running, start it using `ollama serve` in a separate terminal") + raise RuntimeError("Ollama Server is not running, start it using `ollama serve` in a separate terminal") async def health(self) -> HealthResponse: """ diff --git a/llama_stack/providers/remote/inference/passthrough/config.py b/llama_stack/providers/remote/inference/passthrough/config.py index ce41495ce..647b2db46 100644 --- a/llama_stack/providers/remote/inference/passthrough/config.py +++ b/llama_stack/providers/remote/inference/passthrough/config.py @@ -24,8 +24,10 @@ class PassthroughImplConfig(BaseModel): ) @classmethod - def sample_run_config(cls, **kwargs) -> dict[str, Any]: + def sample_run_config( + cls, url: str = "${env.PASSTHROUGH_URL}", api_key: str = "${env.PASSTHROUGH_API_KEY}", **kwargs + ) -> dict[str, Any]: return { - "url": "${env.PASSTHROUGH_URL}", - "api_key": "${env.PASSTHROUGH_API_KEY}", + "url": url, + "api_key": api_key, } diff --git a/llama_stack/providers/remote/inference/tgi/config.py b/llama_stack/providers/remote/inference/tgi/config.py index 3d632c9d8..d4448871f 100644 --- a/llama_stack/providers/remote/inference/tgi/config.py +++ b/llama_stack/providers/remote/inference/tgi/config.py @@ -17,7 +17,11 @@ class TGIImplConfig(BaseModel): ) @classmethod - def sample_run_config(cls, url: str = "${env.TGI_URL}", **kwargs): + def sample_run_config( + cls, + url: str = "${env.TGI_URL}", + **kwargs, + ): return { "url": url, } diff --git a/llama_stack/providers/remote/inference/tgi/tgi.py b/llama_stack/providers/remote/inference/tgi/tgi.py index 292d74ef8..031200d4a 100644 --- a/llama_stack/providers/remote/inference/tgi/tgi.py +++ b/llama_stack/providers/remote/inference/tgi/tgi.py @@ -327,7 +327,6 @@ class InferenceEndpointAdapter(_HfAdapter): # Get the inference endpoint details api = HfApi(token=config.api_token.get_secret_value()) endpoint = api.get_inference_endpoint(config.endpoint_name) - # Wait for the endpoint to be ready (if not already) endpoint.wait(timeout=60) diff --git a/llama_stack/templates/bedrock/__init__.py b/llama_stack/templates/bedrock/__init__.py deleted file mode 100644 index 4e7965550..000000000 --- a/llama_stack/templates/bedrock/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .bedrock import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/bedrock/bedrock.py b/llama_stack/templates/bedrock/bedrock.py deleted file mode 100644 index bc3a9304f..000000000 --- a/llama_stack/templates/bedrock/bedrock.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from pathlib import Path - -from llama_stack.distribution.datatypes import Provider, ToolGroupInput -from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig -from llama_stack.providers.remote.inference.bedrock.models import MODEL_ENTRIES -from llama_stack.templates.template import ( - DistributionTemplate, - RunConfigSettings, - get_model_registry, -) - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::bedrock"], - "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["remote::bedrock"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::rag-runtime", - "remote::model-context-protocol", - ], - } - name = "bedrock" - vector_io_provider = Provider( - provider_id="faiss", - provider_type="inline::faiss", - config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - - available_models = { - "bedrock": MODEL_ENTRIES, - } - default_models = get_model_registry(available_models) - - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ] - - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Use AWS Bedrock for running LLM inference and safety", - container_image=None, - template_path=Path(__file__).parent / "doc_template.md", - providers=providers, - available_models_by_provider=available_models, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "vector_io": [vector_io_provider], - }, - default_models=default_models, - default_tool_groups=default_tool_groups, - ), - }, - run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - }, - ) diff --git a/llama_stack/templates/bedrock/build.yaml b/llama_stack/templates/bedrock/build.yaml deleted file mode 100644 index 1a2c883fa..000000000 --- a/llama_stack/templates/bedrock/build.yaml +++ /dev/null @@ -1,34 +0,0 @@ -version: 2 -distribution_spec: - description: Use AWS Bedrock for running LLM inference and safety - providers: - inference: - - remote::bedrock - vector_io: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: - - remote::bedrock - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime - - remote::model-context-protocol -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/bedrock/doc_template.md b/llama_stack/templates/bedrock/doc_template.md deleted file mode 100644 index e93bb92f2..000000000 --- a/llama_stack/templates/bedrock/doc_template.md +++ /dev/null @@ -1,73 +0,0 @@ -# Bedrock Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations: - -{{ providers_table }} - - -{% if run_config_env_vars 
%} -### Environment Variables - -The following environment variables can be configured: - -{% for var, (default_value, description) in run_config_env_vars.items() %} -- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) -{% endfor %} -{% endif %} - -{% if default_models %} -### Models - -The following models are available by default: - -{% for model in default_models %} -- `{{ model.model_id }} {{ model.doc_string }}` -{% endfor %} -{% endif %} - - -### Prerequisite: API Keys - -Make sure you have access to a AWS Bedrock API Key. You can get one by visiting [AWS Bedrock](https://aws.amazon.com/bedrock/). - - -## Running Llama Stack with AWS Bedrock - -You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. - -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - llamastack/distribution-{{ name }} \ - --port $LLAMA_STACK_PORT \ - --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ - --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ - --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN \ - --env AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION -``` - -### Via Conda - -```bash -llama stack build --template {{ name }} --image-type conda -llama stack run ./run.yaml \ - --port $LLAMA_STACK_PORT \ - --env AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ - --env AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ - --env AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN \ - --env AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION -``` diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml deleted file mode 100644 index 61bc83f02..000000000 --- a/llama_stack/templates/bedrock/run.yaml +++ /dev/null @@ -1,147 +0,0 @@ -version: 2 -image_name: bedrock -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: bedrock - provider_type: remote::bedrock - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/faiss_store.db - safety: - - provider_id: bedrock - provider_type: remote::bedrock - config: {} - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: 
${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/bedrock}/inference_store.db -models: -- metadata: {} - model_id: meta.llama3-1-8b-instruct-v1:0 - provider_id: bedrock - provider_model_id: meta.llama3-1-8b-instruct-v1:0 - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: bedrock - provider_model_id: meta.llama3-1-8b-instruct-v1:0 - model_type: llm -- metadata: {} - model_id: meta.llama3-1-70b-instruct-v1:0 - provider_id: bedrock - provider_model_id: meta.llama3-1-70b-instruct-v1:0 - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-70B-Instruct - provider_id: bedrock - provider_model_id: meta.llama3-1-70b-instruct-v1:0 - model_type: llm -- metadata: {} - model_id: meta.llama3-1-405b-instruct-v1:0 - provider_id: bedrock - provider_model_id: meta.llama3-1-405b-instruct-v1:0 - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: bedrock - provider_model_id: meta.llama3-1-405b-instruct-v1:0 - model_type: llm -shields: [] -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/cerebras/__init__.py b/llama_stack/templates/cerebras/__init__.py deleted file mode 100644 index 9f9929b52..000000000 --- a/llama_stack/templates/cerebras/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from .cerebras import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/cerebras/build.yaml b/llama_stack/templates/cerebras/build.yaml deleted file mode 100644 index ecd0ac418..000000000 --- a/llama_stack/templates/cerebras/build.yaml +++ /dev/null @@ -1,34 +0,0 @@ -version: 2 -distribution_spec: - description: Use Cerebras for running LLM inference - providers: - inference: - - remote::cerebras - - inline::sentence-transformers - safety: - - inline::llama-guard - vector_io: - - inline::faiss - - remote::chromadb - - remote::pgvector - agents: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - telemetry: - - inline::meta-reference - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/cerebras/cerebras.py b/llama_stack/templates/cerebras/cerebras.py deleted file mode 100644 index f341a88c1..000000000 --- a/llama_stack/templates/cerebras/cerebras.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from pathlib import Path - -from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ModelInput, Provider, ToolGroupInput -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig -from llama_stack.providers.remote.inference.cerebras import CerebrasImplConfig -from llama_stack.providers.remote.inference.cerebras.models import MODEL_ENTRIES -from llama_stack.templates.template import ( - DistributionTemplate, - RunConfigSettings, - get_model_registry, -) - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::cerebras", "inline::sentence-transformers"], - "safety": ["inline::llama-guard"], - "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "agents": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "telemetry": ["inline::meta-reference"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::rag-runtime", - ], - } - - name = "cerebras" - inference_provider = Provider( - provider_id="cerebras", - provider_type="remote::cerebras", - config=CerebrasImplConfig.sample_run_config(), - ) - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - - available_models = { - "cerebras": MODEL_ENTRIES, - } - default_models = get_model_registry(available_models) - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id="sentence-transformers", - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - vector_io_provider = Provider( - provider_id="faiss", - provider_type="inline::faiss", - config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - default_tool_groups = 
[ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ] - - return DistributionTemplate( - name="cerebras", - distro_type="self_hosted", - description="Use Cerebras for running LLM inference", - container_image=None, - template_path=Path(__file__).parent / "doc_template.md", - providers=providers, - available_models_by_provider=available_models, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider, embedding_provider], - "vector_io": [vector_io_provider], - }, - default_models=default_models + [embedding_model], - default_shields=[], - default_tool_groups=default_tool_groups, - ), - }, - run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - "CEREBRAS_API_KEY": ( - "", - "Cerebras API Key", - ), - }, - ) diff --git a/llama_stack/templates/cerebras/doc_template.md b/llama_stack/templates/cerebras/doc_template.md deleted file mode 100644 index 5cae2b2da..000000000 --- a/llama_stack/templates/cerebras/doc_template.md +++ /dev/null @@ -1,61 +0,0 @@ -# Cerebras Distribution - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. - -{{ providers_table }} - -{% if run_config_env_vars %} -### Environment Variables - -The following environment variables can be configured: - -{% for var, (default_value, description) in run_config_env_vars.items() %} -- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) -{% endfor %} -{% endif %} - -{% if default_models %} -### Models - -The following models are available by default: - -{% for model in default_models %} -- `{{ model.model_id }} {{ model.doc_string }}` -{% endfor %} -{% endif %} - - -### Prerequisite: API Keys - -Make sure you have access to a Cerebras API Key. You can get one by visiting [cloud.cerebras.ai](https://cloud.cerebras.ai/). - - -## Running Llama Stack with Cerebras - -You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. 
- -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ - llamastack/distribution-{{ name }} \ - --config /root/my-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY -``` - -### Via Conda - -```bash -llama stack build --template cerebras --image-type conda -llama stack run ./run.yaml \ - --port 8321 \ - --env CEREBRAS_API_KEY=$CEREBRAS_API_KEY -``` diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml deleted file mode 100644 index 9bd8fcc7c..000000000 --- a/llama_stack/templates/cerebras/run.yaml +++ /dev/null @@ -1,145 +0,0 @@ -version: 2 -image_name: cerebras -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: cerebras - provider_type: remote::cerebras - config: - base_url: https://api.cerebras.ai - api_key: ${env.CEREBRAS_API_KEY} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/faiss_store.db - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/responses_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/trace_store.db - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/registry.db -inference_store: - type: sqlite - db_path: 
${env.SQLITE_STORE_DIR:=~/.llama/distributions/cerebras}/inference_store.db -models: -- metadata: {} - model_id: llama3.1-8b - provider_id: cerebras - provider_model_id: llama3.1-8b - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: cerebras - provider_model_id: llama3.1-8b - model_type: llm -- metadata: {} - model_id: llama-3.3-70b - provider_id: cerebras - provider_model_id: llama-3.3-70b - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: cerebras - provider_model_id: llama-3.3-70b - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: [] -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/ci-tests/__init__.py b/llama_stack/templates/ci-tests/__init__.py deleted file mode 100644 index b309587f5..000000000 --- a/llama_stack/templates/ci-tests/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .ci_tests import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/ci-tests/build.yaml b/llama_stack/templates/ci-tests/build.yaml deleted file mode 100644 index c061d0793..000000000 --- a/llama_stack/templates/ci-tests/build.yaml +++ /dev/null @@ -1,35 +0,0 @@ -version: 2 -distribution_spec: - description: Distribution for running e2e tests in CI - providers: - inference: - - remote::fireworks - - inline::sentence-transformers - vector_io: - - inline::sqlite-vec - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime - - remote::model-context-protocol -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/ci-tests/ci_tests.py b/llama_stack/templates/ci-tests/ci_tests.py deleted file mode 100644 index 7de8069ae..000000000 --- a/llama_stack/templates/ci-tests/ci_tests.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- - -from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ShieldInput, - ToolGroupInput, -) -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.inline.vector_io.sqlite_vec.config import ( - SQLiteVectorIOConfig, -) -from llama_stack.providers.remote.inference.fireworks.config import FireworksImplConfig -from llama_stack.providers.remote.inference.fireworks.models import MODEL_ENTRIES -from llama_stack.templates.template import ( - DistributionTemplate, - RunConfigSettings, - get_model_registry, -) - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::fireworks", "inline::sentence-transformers"], - "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::rag-runtime", - "remote::model-context-protocol", - ], - } - name = "ci-tests" - inference_provider = Provider( - provider_id="fireworks", - provider_type="remote::fireworks", - config=FireworksImplConfig.sample_run_config(), - ) - vector_io_provider = Provider( - provider_id="sqlite-vec", - provider_type="inline::sqlite-vec", - config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ] - available_models = { - "fireworks": MODEL_ENTRIES, - } - default_models = get_model_registry(available_models) - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id="sentence-transformers", - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Distribution for running e2e tests in CI", - container_image=None, - template_path=None, - providers=providers, - available_models_by_provider=available_models, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider, embedding_provider], - "vector_io": [vector_io_provider], - }, - default_models=default_models + [embedding_model], - default_tool_groups=default_tool_groups, - default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], - ), - }, - run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - "FIREWORKS_API_KEY": ( - "", - "Fireworks API Key", - ), - }, - ) diff --git a/llama_stack/templates/ci-tests/run.yaml b/llama_stack/templates/ci-tests/run.yaml deleted file mode 100644 index 4b7de1c0c..000000000 --- a/llama_stack/templates/ci-tests/run.yaml +++ /dev/null @@ -1,243 +0,0 @@ -version: 2 -image_name: ci-tests -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io 
-providers: - inference: - - provider_id: fireworks - provider_type: remote::fireworks - config: - url: https://api.fireworks.ai/inference/v1 - api_key: ${env.FIREWORKS_API_KEY} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: sqlite-vec - provider_type: inline::sqlite-vec - config: - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/sqlite_vec.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ci-tests}/inference_store.db -models: -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-8b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-70b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-70B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct - 
model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-405b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-3b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p3-70b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-guard-3-8b - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-guard-3-8b - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-8B - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-guard-3-8b - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-guard-3-11b-vision - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-11B-Vision - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama4-scout-instruct-basic - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama4-maverick-instruct-basic - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic - model_type: llm -- metadata: - embedding_dimension: 768 - context_length: 8192 - model_id: nomic-ai/nomic-embed-text-v1.5 - provider_id: fireworks - 
provider_model_id: nomic-ai/nomic-embed-text-v1.5 - model_type: embedding -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: -- shield_id: meta-llama/Llama-Guard-3-8B -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/dell/__init__.py b/llama_stack/templates/dell/__init__.py deleted file mode 100644 index 143add56e..000000000 --- a/llama_stack/templates/dell/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .dell import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/dell/build.yaml b/llama_stack/templates/dell/build.yaml deleted file mode 100644 index ff8d58a08..000000000 --- a/llama_stack/templates/dell/build.yaml +++ /dev/null @@ -1,35 +0,0 @@ -version: 2 -distribution_spec: - description: Dell's distribution of Llama Stack. TGI inference via Dell's custom - container - providers: - inference: - - remote::tgi - - inline::sentence-transformers - vector_io: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/dell/dell.py b/llama_stack/templates/dell/dell.py deleted file mode 100644 index 5a6f52a89..000000000 --- a/llama_stack/templates/dell/dell.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ShieldInput, - ToolGroupInput, -) -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::tgi", "inline::sentence-transformers"], - "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::rag-runtime", - ], - } - name = "dell" - inference_provider = Provider( - provider_id="tgi0", - provider_type="remote::tgi", - config={ - "url": "${env.DEH_URL}", - }, - ) - safety_inference_provider = Provider( - provider_id="tgi1", - provider_type="remote::tgi", - config={ - "url": "${env.DEH_SAFETY_URL}", - }, - ) - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - chromadb_provider = Provider( - provider_id="chromadb", - provider_type="remote::chromadb", - config={ - "url": "${env.CHROMA_URL}", - }, - ) - - inference_model = ModelInput( - model_id="${env.INFERENCE_MODEL}", - provider_id="tgi0", - ) - safety_model = ModelInput( - model_id="${env.SAFETY_MODEL}", - provider_id="tgi1", - ) - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id="sentence-transformers", - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="brave-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ] - - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Dell's distribution of Llama Stack. 
TGI inference via Dell's custom container", - container_image=None, - providers=providers, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider, embedding_provider], - "vector_io": [chromadb_provider], - }, - default_models=[inference_model, embedding_model], - default_tool_groups=default_tool_groups, - ), - "run-with-safety.yaml": RunConfigSettings( - provider_overrides={ - "inference": [ - inference_provider, - safety_inference_provider, - embedding_provider, - ], - "vector_io": [chromadb_provider], - }, - default_models=[inference_model, safety_model, embedding_model], - default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], - default_tool_groups=default_tool_groups, - ), - }, - run_config_env_vars={ - "DEH_URL": ( - "http://0.0.0.0:8181", - "URL for the Dell inference server", - ), - "DEH_SAFETY_URL": ( - "http://0.0.0.0:8282", - "URL for the Dell safety inference server", - ), - "CHROMA_URL": ( - "http://localhost:6601", - "URL for the Chroma server", - ), - "INFERENCE_MODEL": ( - "meta-llama/Llama-3.2-3B-Instruct", - "Inference model loaded into the TGI server", - ), - "SAFETY_MODEL": ( - "meta-llama/Llama-Guard-3-1B", - "Name of the safety (Llama-Guard) model to use", - ), - }, - ) diff --git a/llama_stack/templates/dell/doc_template.md b/llama_stack/templates/dell/doc_template.md deleted file mode 100644 index 6bdd7f81c..000000000 --- a/llama_stack/templates/dell/doc_template.md +++ /dev/null @@ -1,178 +0,0 @@ ---- -orphan: true ---- - -# Dell Distribution of Llama Stack - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. - -{{ providers_table }} - -You can use this distribution if you have GPUs and want to run an independent TGI or Dell Enterprise Hub container for running inference. - -{% if run_config_env_vars %} -### Environment Variables - -The following environment variables can be configured: - -{% for var, (default_value, description) in run_config_env_vars.items() %} -- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) -{% endfor %} -{% endif %} - - -## Setting up Inference server using Dell Enterprise Hub's custom TGI container. - -NOTE: This is a placeholder to run inference with TGI. This will be updated to use [Dell Enterprise Hub's containers](https://dell.huggingface.co/authenticated/models) once verified. 
- -```bash -export INFERENCE_PORT=8181 -export DEH_URL=http://0.0.0.0:$INFERENCE_PORT -export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct -export CHROMADB_HOST=localhost -export CHROMADB_PORT=6601 -export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT -export CUDA_VISIBLE_DEVICES=0 -export LLAMA_STACK_PORT=8321 - -docker run --rm -it \ - --pull always \ - --network host \ - -v $HOME/.cache/huggingface:/data \ - -e HF_TOKEN=$HF_TOKEN \ - -p $INFERENCE_PORT:$INFERENCE_PORT \ - --gpus $CUDA_VISIBLE_DEVICES \ - ghcr.io/huggingface/text-generation-inference \ - --dtype bfloat16 \ - --usage-stats off \ - --sharded false \ - --cuda-memory-fraction 0.7 \ - --model-id $INFERENCE_MODEL \ - --port $INFERENCE_PORT --hostname 0.0.0.0 -``` - -If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: - -```bash -export SAFETY_INFERENCE_PORT=8282 -export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT -export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B -export CUDA_VISIBLE_DEVICES=1 - -docker run --rm -it \ - --pull always \ - --network host \ - -v $HOME/.cache/huggingface:/data \ - -e HF_TOKEN=$HF_TOKEN \ - -p $SAFETY_INFERENCE_PORT:$SAFETY_INFERENCE_PORT \ - --gpus $CUDA_VISIBLE_DEVICES \ - ghcr.io/huggingface/text-generation-inference \ - --dtype bfloat16 \ - --usage-stats off \ - --sharded false \ - --cuda-memory-fraction 0.7 \ - --model-id $SAFETY_MODEL \ - --hostname 0.0.0.0 \ - --port $SAFETY_INFERENCE_PORT -``` - -## Dell distribution relies on ChromaDB for vector database usage - -You can start a chroma-db easily using docker. -```bash -# This is where the indices are persisted -mkdir -p $HOME/chromadb - -podman run --rm -it \ - --network host \ - --name chromadb \ - -v $HOME/chromadb:/chroma/chroma \ - -e IS_PERSISTENT=TRUE \ - chromadb/chroma:latest \ - --port $CHROMADB_PORT \ - --host $CHROMADB_HOST -``` - -## Running Llama Stack - -Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. 
- -```bash -docker run -it \ - --pull always \ - --network host \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v $HOME/.llama:/root/.llama \ - # NOTE: mount the llama-stack directory if testing local changes else not needed - -v /home/hjshah/git/llama-stack:/app/llama-stack-source \ - # localhost/distribution-dell:dev if building / testing locally - llamastack/distribution-{{ name }}\ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env DEH_URL=$DEH_URL \ - --env CHROMA_URL=$CHROMA_URL - -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -# You need a local checkout of llama-stack to run this, get it using -# git clone https://github.com/meta-llama/llama-stack.git -cd /path/to/llama-stack - -export SAFETY_INFERENCE_PORT=8282 -export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT -export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B - -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v $HOME/.llama:/root/.llama \ - -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \ - llamastack/distribution-{{ name }} \ - --config /root/my-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env DEH_URL=$DEH_URL \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env DEH_SAFETY_URL=$DEH_SAFETY_URL \ - --env CHROMA_URL=$CHROMA_URL -``` - -### Via Conda - -Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available. - -```bash -llama stack build --template {{ name }} --image-type conda -llama stack run {{ name }} - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env DEH_URL=$DEH_URL \ - --env CHROMA_URL=$CHROMA_URL -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -llama stack run ./run-with-safety.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env DEH_URL=$DEH_URL \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env DEH_SAFETY_URL=$DEH_SAFETY_URL \ - --env CHROMA_URL=$CHROMA_URL -``` diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml deleted file mode 100644 index 7f1d0a8c0..000000000 --- a/llama_stack/templates/dell/run-with-safety.yaml +++ /dev/null @@ -1,134 +0,0 @@ -version: 2 -image_name: dell -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: tgi0 - provider_type: remote::tgi - config: - url: ${env.DEH_URL} - - provider_id: tgi1 - provider_type: remote::tgi - config: - url: ${env.DEH_SAFETY_URL} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: chromadb - provider_type: remote::chromadb - config: - url: ${env.CHROMA_URL} - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: 
${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/inference_store.db -models: -- metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: tgi0 - model_type: llm -- metadata: {} - model_id: ${env.SAFETY_MODEL} - provider_id: tgi1 - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: -- shield_id: ${env.SAFETY_MODEL} -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: brave-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml deleted file mode 100644 index 310f3cc20..000000000 --- a/llama_stack/templates/dell/run.yaml +++ /dev/null @@ -1,125 +0,0 @@ -version: 2 -image_name: dell -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: tgi0 - provider_type: remote::tgi - config: - url: ${env.DEH_URL} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: chromadb - provider_type: remote::chromadb - config: - url: ${env.CHROMA_URL} - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: 
${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/dell}/inference_store.db -models: -- metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: tgi0 - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: [] -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: brave-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/experimental-post-training/build.yaml b/llama_stack/templates/experimental-post-training/build.yaml deleted file mode 100644 index 55cd189c6..000000000 --- a/llama_stack/templates/experimental-post-training/build.yaml +++ /dev/null @@ -1,30 +0,0 @@ -version: '2' -name: experimental-post-training -distribution_spec: - description: Experimental template for post training - container_image: null - providers: - inference: - - inline::meta-reference - - remote::ollama - eval: - - inline::meta-reference - scoring: - - inline::basic - - inline::braintrust - post_training: - - inline::huggingface - datasetio: - - inline::localfs - - remote::huggingface - telemetry: - - inline::meta-reference - agents: - - inline::meta-reference - safety: - - inline::llama-guard - vector_io: - - inline::faiss - tool_runtime: - - remote::brave-search -image_type: conda diff --git a/llama_stack/templates/experimental-post-training/run.yaml b/llama_stack/templates/experimental-post-training/run.yaml deleted file mode 100644 index a74aa3647..000000000 --- a/llama_stack/templates/experimental-post-training/run.yaml +++ /dev/null @@ -1,107 +0,0 @@ -version: '2' -image_name: experimental-post-training -container_image: null -conda_env: experimental-post-training -apis: -- agents -- datasetio -- eval -- inference -- vector_io -- safety -- scoring -- telemetry -- post_training -- tool_runtime -providers: - inference: - - provider_id: meta-reference-inference - provider_type: 
inline::meta-reference - config: - max_seq_len: 4096 - checkpoint_dir: null - create_distributed_process_group: False - - provider_id: ollama - provider_type: remote::ollama - config: - url: ${env.OLLAMA_URL:=http://localhost:11434} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/meta_reference_eval.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - datasetio: - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/experimental-post-training}/localfs_datasetio.db - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/huggingface}/huggingface_datasetio.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: {} - post_training: - - provider_id: huggingface - provider_type: inline::huggingface - config: - checkpoint_format: huggingface - distributed_backend: null - device: cpu - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/experimental-post-training}/agents_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/experimental-post-training}/faiss_store.db - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - -metadata_store: - namespace: null - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/experimental-post-training}/registry.db -models: [] -shields: [] -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] diff --git a/llama_stack/templates/fireworks/__init__.py b/llama_stack/templates/fireworks/__init__.py deleted file mode 100644 index 1d85c66db..000000000 --- a/llama_stack/templates/fireworks/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
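The Python-defined templates removed in this PR (dell, fireworks, groq, hf-endpoint) each expose a `get_distribution_template()` that returns a `DistributionTemplate` describing the providers per API and the run-config YAML files to generate. A minimal sketch of inspecting one of these modules as they existed before removal follows; the attribute names are assumed to mirror the constructor keywords visible in the deleted code and are not verified API.

```python
# Sketch only: inspect one of the template modules deleted in this PR.
# Assumes DistributionTemplate/RunConfigSettings expose their constructor
# keywords (name, providers, run_configs, provider_overrides) as attributes.
from llama_stack.templates.fireworks import get_distribution_template

template = get_distribution_template()
print(template.name)                    # "fireworks"
print(sorted(template.providers))       # APIs covered: agents, inference, safety, ...
for filename, settings in template.run_configs.items():
    # e.g. "run.yaml" and "run-with-safety.yaml" for the fireworks template
    print(filename, sorted(settings.provider_overrides))
```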
- -from .fireworks import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/fireworks/build.yaml b/llama_stack/templates/fireworks/build.yaml deleted file mode 100644 index eb08c1d43..000000000 --- a/llama_stack/templates/fireworks/build.yaml +++ /dev/null @@ -1,38 +0,0 @@ -version: 2 -distribution_spec: - description: Use Fireworks.AI for running LLM inference - providers: - inference: - - remote::fireworks - - inline::sentence-transformers - vector_io: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - files: - - inline::localfs - tool_runtime: - - remote::brave-search - - remote::tavily-search - - remote::wolfram-alpha - - inline::rag-runtime - - remote::model-context-protocol -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/fireworks/doc_template.md b/llama_stack/templates/fireworks/doc_template.md deleted file mode 100644 index ba0205db0..000000000 --- a/llama_stack/templates/fireworks/doc_template.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -orphan: true ---- -# Fireworks Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. - -{{ providers_table }} - -{% if run_config_env_vars %} -### Environment Variables - -The following environment variables can be configured: - -{% for var, (default_value, description) in run_config_env_vars.items() %} -- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) -{% endfor %} -{% endif %} - -{% if default_models %} -### Models - -The following models are available by default: - -{% for model in default_models %} -- `{{ model.model_id }} {{ model.doc_string }}` -{% endfor %} -{% endif %} - - -### Prerequisite: API Keys - -Make sure you have access to a Fireworks API Key. You can get one by visiting [fireworks.ai](https://fireworks.ai/). - - -## Running Llama Stack with Fireworks - -You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. - -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - llamastack/distribution-{{ name }} \ - --port $LLAMA_STACK_PORT \ - --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY -``` - -### Via Conda - -```bash -llama stack build --template fireworks --image-type conda -llama stack run ./run.yaml \ - --port $LLAMA_STACK_PORT \ - --env FIREWORKS_API_KEY=$FIREWORKS_API_KEY -``` diff --git a/llama_stack/templates/fireworks/fireworks.py b/llama_stack/templates/fireworks/fireworks.py deleted file mode 100644 index ad29c648f..000000000 --- a/llama_stack/templates/fireworks/fireworks.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
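The `doc_template.md` files deleted in this PR use `{{ name }}`, `{{ providers_table }}`, and `{% for var, (default_value, description) in run_config_env_vars.items() %}` placeholders. The docs generator itself is not part of this diff; the snippet below only illustrates that these placeholders behave like ordinary Jinja2, using the Dell template's environment-variable table from earlier in the diff as sample input.

```python
# Hypothetical rendering of the placeholder syntax used by the deleted
# doc_template.md files; llama-stack's real docs generator is not shown here.
from jinja2 import Template

doc = Template(
    "# {{ name }} Distribution of Llama Stack\n"
    "{% for var, (default_value, description) in run_config_env_vars.items() %}"
    "- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)\n"
    "{% endfor %}"
)

print(
    doc.render(
        name="dell",
        run_config_env_vars={
            "DEH_URL": ("http://0.0.0.0:8181", "URL for the Dell inference server"),
            "CHROMA_URL": ("http://localhost:6601", "URL for the Chroma server"),
        },
    )
)
```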
- -from pathlib import Path - -from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ShieldInput, - ToolGroupInput, -) -from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig -from llama_stack.providers.remote.inference.fireworks.config import FireworksImplConfig -from llama_stack.providers.remote.inference.fireworks.models import MODEL_ENTRIES -from llama_stack.templates.template import ( - DistributionTemplate, - RunConfigSettings, - get_model_registry, -) - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::fireworks", "inline::sentence-transformers"], - "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "files": ["inline::localfs"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "remote::wolfram-alpha", - "inline::rag-runtime", - "remote::model-context-protocol", - ], - } - - name = "fireworks" - - inference_provider = Provider( - provider_id="fireworks", - provider_type="remote::fireworks", - config=FireworksImplConfig.sample_run_config(), - ) - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - vector_io_provider = Provider( - provider_id="faiss", - provider_type="inline::faiss", - config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - files_provider = Provider( - provider_id="meta-reference-files", - provider_type="inline::localfs", - config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - - available_models = { - "fireworks": MODEL_ENTRIES, - } - default_models = get_model_registry(available_models) - - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id="sentence-transformers", - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::wolfram_alpha", - provider_id="wolfram-alpha", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ] - - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Use Fireworks.AI for running LLM inference", - container_image=None, - template_path=Path(__file__).parent / "doc_template.md", - providers=providers, - available_models_by_provider=available_models, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider, embedding_provider], - "vector_io": [vector_io_provider], - "files": [files_provider], - }, - default_models=default_models + [embedding_model], - default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], - default_tool_groups=default_tool_groups, - ), - "run-with-safety.yaml": RunConfigSettings( - 
provider_overrides={ - "inference": [ - inference_provider, - embedding_provider, - ], - "vector_io": [vector_io_provider], - "files": [files_provider], - "safety": [ - Provider( - provider_id="llama-guard", - provider_type="inline::llama-guard", - config={}, - ), - Provider( - provider_id="llama-guard-vision", - provider_type="inline::llama-guard", - config={}, - ), - Provider( - provider_id="code-scanner", - provider_type="inline::code-scanner", - config={}, - ), - ], - }, - default_models=[ - *default_models, - embedding_model, - ], - default_shields=[ - ShieldInput( - shield_id="meta-llama/Llama-Guard-3-8B", - provider_id="llama-guard", - ), - ShieldInput( - shield_id="meta-llama/Llama-Guard-3-11B-Vision", - provider_id="llama-guard-vision", - ), - ShieldInput( - shield_id="CodeScanner", - provider_id="code-scanner", - ), - ], - default_tool_groups=default_tool_groups, - ), - }, - run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - "FIREWORKS_API_KEY": ( - "", - "Fireworks.AI API Key", - ), - }, - ) diff --git a/llama_stack/templates/fireworks/remote-hosted-report.md b/llama_stack/templates/fireworks/remote-hosted-report.md deleted file mode 100644 index 2f3c882b7..000000000 --- a/llama_stack/templates/fireworks/remote-hosted-report.md +++ /dev/null @@ -1,45 +0,0 @@ -# Report for fireworks distribution - -## Supported Models -| Model Descriptor | fireworks | -|:---|:---| -| meta-llama/Llama-3-8B-Instruct | ❌ | -| meta-llama/Llama-3-70B-Instruct | ❌ | -| meta-llama/Llama-3.1-8B-Instruct | ❌ | -| meta-llama/Llama-3.1-70B-Instruct | ❌ | -| meta-llama/Llama-3.1-405B-Instruct-FP8 | ❌ | -| meta-llama/Llama-3.2-1B-Instruct | ❌ | -| meta-llama/Llama-3.2-3B-Instruct | ❌ | -| meta-llama/Llama-3.2-11B-Vision-Instruct | ❌ | -| meta-llama/Llama-3.2-90B-Vision-Instruct | ❌ | -| meta-llama/Llama-3.3-70B-Instruct | ❌ | -| meta-llama/Llama-Guard-3-11B-Vision | ❌ | -| meta-llama/Llama-Guard-3-1B | ❌ | -| meta-llama/Llama-Guard-3-8B | ❌ | -| meta-llama/Llama-Guard-2-8B | ❌ | - -## Inference -| Model | API | Capability | Test | Status | -|:----- |:-----|:-----|:-----|:-----| -| Text | /chat_completion | streaming | test_text_chat_completion_streaming | ❌ | -| Vision | /chat_completion | streaming | test_image_chat_completion_streaming | ❌ | -| Vision | /chat_completion | non_streaming | test_image_chat_completion_non_streaming | ❌ | -| Text | /chat_completion | non_streaming | test_text_chat_completion_non_streaming | ❌ | -| Text | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_streaming | ❌ | -| Text | /chat_completion | tool_calling | test_text_chat_completion_with_tool_calling_and_non_streaming | ❌ | -| Text | /completion | streaming | test_text_completion_streaming | ❌ | -| Text | /completion | non_streaming | test_text_completion_non_streaming | ❌ | -| Text | /completion | structured_output | test_text_completion_structured_output | ❌ | - -## Memory: -| API | Capability | Test | Status | -|:-----|:-----|:-----|:-----| -| /insert, /query | inline | test_memory_bank_insert_inline_and_query | ❌ | -| /insert, /query | url | test_memory_bank_insert_from_url_and_query | ❌ | - -## Agents -| API | Capability | Test | Status | -|:-----|:-----|:-----|:-----| -| create_agent_turn | rag | test_rag_agent | ❌ | -| create_agent_turn | custom_tool | test_custom_tool | ❌ | -| create_agent_turn | code_execution | test_code_execution | ❌ | diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml 
b/llama_stack/templates/fireworks/run-with-safety.yaml deleted file mode 100644 index 6265f5cae..000000000 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ /dev/null @@ -1,271 +0,0 @@ -version: 2 -image_name: fireworks -apis: -- agents -- datasetio -- eval -- files -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: fireworks - provider_type: remote::fireworks - config: - url: https://api.fireworks.ai/inference/v1 - api_key: ${env.FIREWORKS_API_KEY} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: {} - - provider_id: llama-guard-vision - provider_type: inline::llama-guard - config: {} - - provider_id: code-scanner - provider_type: inline::code-scanner - config: {} - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - files: - - provider_id: meta-reference-files - provider_type: inline::localfs - config: - storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/fireworks/files} - metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/files_metadata.db - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: wolfram-alpha - provider_type: remote::wolfram-alpha - config: - api_key: ${env.WOLFRAM_ALPHA_API_KEY:+} - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} 
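A recurring detail in the generated run.yaml files in this diff is the `${env.VAR}`, `${env.VAR:=default}`, and `${env.VAR:+}` placeholder syntax. The stack's own substitution code is not shown here, so the sketch below resolves these forms under assumed, shell-like semantics (`:=` supplies a default when the variable is unset; `:+` is read as "use the variable's value when set, otherwise leave the field empty") purely to make the notation concrete.

```python
import os
import re

# Hypothetical resolver for the placeholder forms used throughout the YAML
# above. This is NOT llama-stack's implementation; the ":=" and ":+" semantics
# are assumptions made for illustration only.
_PLACEHOLDER = re.compile(r"\$\{env\.([A-Za-z0-9_]+)(?::([=+])([^}]*))?\}")

def resolve(value: str) -> str:
    def _sub(m: re.Match) -> str:
        name, op, default = m.group(1), m.group(2), m.group(3) or ""
        found = os.environ.get(name)
        if found is not None:
            return found                  # set in the environment: use it
        if op == "=":
            return default                # ${env.VAR:=default} -> default when unset
        if op == "+":
            return ""                     # ${env.VAR:+} -> empty when unset
        return m.group(0)                 # bare ${env.VAR}: left untouched when unset

    return _PLACEHOLDER.sub(_sub, value)

print(resolve("${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/registry.db"))
# -> "~/.llama/distributions/fireworks/registry.db" unless SQLITE_STORE_DIR is set
```

Judging from the configs above, `:=` is used where a sensible default exists (ports, SQLite paths, telemetry sinks), while API keys use `:+` so they remain unset unless explicitly provided.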
-metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/inference_store.db -models: -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-8b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-70b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-70B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-405b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-3b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p3-70b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-guard-3-8b - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-guard-3-8b - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-8B - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-guard-3-8b - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-guard-3-11b-vision - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-11B-Vision - provider_id: fireworks - provider_model_id: 
accounts/fireworks/models/llama-guard-3-11b-vision - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama4-scout-instruct-basic - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama4-maverick-instruct-basic - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic - model_type: llm -- metadata: - embedding_dimension: 768 - context_length: 8192 - model_id: nomic-ai/nomic-embed-text-v1.5 - provider_id: fireworks - provider_model_id: nomic-ai/nomic-embed-text-v1.5 - model_type: embedding -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: -- shield_id: meta-llama/Llama-Guard-3-8B - provider_id: llama-guard -- shield_id: meta-llama/Llama-Guard-3-11B-Vision - provider_id: llama-guard-vision -- shield_id: CodeScanner - provider_id: code-scanner -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::wolfram_alpha - provider_id: wolfram-alpha -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml deleted file mode 100644 index e10404e92..000000000 --- a/llama_stack/templates/fireworks/run.yaml +++ /dev/null @@ -1,261 +0,0 @@ -version: 2 -image_name: fireworks -apis: -- agents -- datasetio -- eval -- files -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: fireworks - provider_type: remote::fireworks - config: - url: https://api.fireworks.ai/inference/v1 - api_key: ${env.FIREWORKS_API_KEY} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/meta_reference_eval.db - 
datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - files: - - provider_id: meta-reference-files - provider_type: inline::localfs - config: - storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/fireworks/files} - metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/files_metadata.db - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: wolfram-alpha - provider_type: remote::wolfram-alpha - config: - api_key: ${env.WOLFRAM_ALPHA_API_KEY:+} - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/fireworks}/inference_store.db -models: -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-8b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-70b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-70B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-405b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-3b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct - model_type: llm -- metadata: {} - model_id: 
meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-v3p3-70b-instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-guard-3-8b - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-guard-3-8b - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-8B - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-guard-3-8b - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama-guard-3-11b-vision - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-11B-Vision - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama4-scout-instruct-basic - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic - model_type: llm -- metadata: {} - model_id: accounts/fireworks/models/llama4-maverick-instruct-basic - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: fireworks - provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic - model_type: llm -- metadata: - embedding_dimension: 768 - context_length: 8192 - model_id: nomic-ai/nomic-embed-text-v1.5 - provider_id: fireworks - provider_model_id: nomic-ai/nomic-embed-text-v1.5 - model_type: embedding -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: -- shield_id: meta-llama/Llama-Guard-3-8B -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::wolfram_alpha - provider_id: wolfram-alpha -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/groq/__init__.py b/llama_stack/templates/groq/__init__.py deleted file mode 100644 index 02a39601d..000000000 --- a/llama_stack/templates/groq/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
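Note the registration pattern in the model lists above: each Fireworks model appears twice, once under its provider-native ID and once under the corresponding `meta-llama/...` alias, with both entries pointing at the same `provider_model_id` (the groq run.yaml further below follows the same pattern). In the template code this expansion appears to come from `get_model_registry()`, whose internals are not part of this diff; the sketch below just restates the resulting alias relationship as plain data.

```python
# Two entries from the fireworks run.yaml above, reduced to plain dictionaries.
entries = [
    {"model_id": "accounts/fireworks/models/llama-v3p1-8b-instruct",
     "provider_model_id": "accounts/fireworks/models/llama-v3p1-8b-instruct"},
    {"model_id": "meta-llama/Llama-3.1-8B-Instruct",
     "provider_model_id": "accounts/fireworks/models/llama-v3p1-8b-instruct"},
]

# Group the registered model_ids by the provider model they ultimately map to.
aliases: dict[str, list[str]] = {}
for entry in entries:
    aliases.setdefault(entry["provider_model_id"], []).append(entry["model_id"])

print(aliases)
# {'accounts/fireworks/models/llama-v3p1-8b-instruct':
#   ['accounts/fireworks/models/llama-v3p1-8b-instruct',
#    'meta-llama/Llama-3.1-8B-Instruct']}
```

Clients can therefore refer to a model by either identifier, since both registrations resolve to the same underlying provider model.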
- -from .groq import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/groq/build.yaml b/llama_stack/templates/groq/build.yaml deleted file mode 100644 index 7e50a899f..000000000 --- a/llama_stack/templates/groq/build.yaml +++ /dev/null @@ -1,31 +0,0 @@ -version: 2 -distribution_spec: - description: Use Groq for running LLM inference - providers: - inference: - - remote::groq - vector_io: - - inline::faiss - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/groq/doc_template.md b/llama_stack/templates/groq/doc_template.md deleted file mode 100644 index 80945ff9c..000000000 --- a/llama_stack/templates/groq/doc_template.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -orphan: true ---- -# Groq Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. - -{{ providers_table }} - -{% if run_config_env_vars %} -### Environment Variables - -The following environment variables can be configured: - -{% for var, (default_value, description) in run_config_env_vars.items() %} -- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) -{% endfor %} -{% endif %} - -{% if default_models %} -### Models - -The following models are available by default: - -{% for model in default_models %} -- `{{ model.model_id }} {{ model.doc_string }}` -{% endfor %} -{% endif %} - - -### Prerequisite: API Keys - -Make sure you have access to a Groq API Key. You can get one by visiting [Groq](https://api.groq.com/). - - -## Running Llama Stack with Groq - -You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. - -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - llamastack/distribution-{{ name }} \ - --port $LLAMA_STACK_PORT \ - --env GROQ_API_KEY=$GROQ_API_KEY -``` - -### Via Conda - -```bash -llama stack build --template groq --image-type conda -llama stack run ./run.yaml \ - --port $LLAMA_STACK_PORT \ - --env GROQ_API_KEY=$GROQ_API_KEY -``` diff --git a/llama_stack/templates/groq/groq.py b/llama_stack/templates/groq/groq.py deleted file mode 100644 index 9e166a288..000000000 --- a/llama_stack/templates/groq/groq.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from pathlib import Path - -from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ModelInput, Provider, ToolGroupInput -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.remote.inference.groq import GroqConfig -from llama_stack.providers.remote.inference.groq.models import MODEL_ENTRIES -from llama_stack.templates.template import ( - DistributionTemplate, - RunConfigSettings, - get_model_registry, -) - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::groq"], - "vector_io": ["inline::faiss"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::rag-runtime", - ], - } - name = "groq" - - inference_provider = Provider( - provider_id=name, - provider_type=f"remote::{name}", - config=GroqConfig.sample_run_config(), - ) - - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id="sentence-transformers", - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - - available_models = { - "groq": MODEL_ENTRIES, - } - default_models = get_model_registry(available_models) - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ] - - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Use Groq for running LLM inference", - docker_image=None, - template_path=Path(__file__).parent / "doc_template.md", - providers=providers, - available_models_by_provider=available_models, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider, embedding_provider], - }, - default_models=default_models + [embedding_model], - default_tool_groups=default_tool_groups, - ), - }, - run_config_env_vars={ - "LLAMASTACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - "GROQ_API_KEY": ( - "", - "Groq API Key", - ), - }, - ) diff --git a/llama_stack/templates/groq/run.yaml b/llama_stack/templates/groq/run.yaml deleted file mode 100644 index 21c8f7e0f..000000000 --- a/llama_stack/templates/groq/run.yaml +++ /dev/null @@ -1,210 +0,0 @@ -version: 2 -image_name: groq -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: groq - provider_type: remote::groq - config: - url: https://api.groq.com - api_key: ${env.GROQ_API_KEY} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - 
provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/groq}/inference_store.db -models: -- metadata: {} - model_id: groq/llama3-8b-8192 - provider_id: groq - provider_model_id: groq/llama3-8b-8192 - model_type: llm -- metadata: {} - model_id: groq/meta-llama/Llama-3.1-8B-Instruct - provider_id: groq - provider_model_id: groq/llama3-8b-8192 - model_type: llm -- metadata: {} - model_id: groq/llama-3.1-8b-instant - provider_id: groq - provider_model_id: groq/llama-3.1-8b-instant - model_type: llm -- metadata: {} - model_id: groq/llama3-70b-8192 - provider_id: groq - provider_model_id: groq/llama3-70b-8192 - model_type: llm -- metadata: {} - model_id: groq/meta-llama/Llama-3-70B-Instruct - provider_id: groq - provider_model_id: groq/llama3-70b-8192 - model_type: llm -- metadata: {} - model_id: groq/llama-3.3-70b-versatile - provider_id: groq - provider_model_id: groq/llama-3.3-70b-versatile - model_type: llm -- metadata: {} - model_id: groq/meta-llama/Llama-3.3-70B-Instruct - provider_id: groq - provider_model_id: groq/llama-3.3-70b-versatile - model_type: llm -- metadata: {} - model_id: groq/llama-3.2-3b-preview - provider_id: groq - provider_model_id: groq/llama-3.2-3b-preview - model_type: llm -- metadata: {} - model_id: groq/meta-llama/Llama-3.2-3B-Instruct - provider_id: groq - provider_model_id: groq/llama-3.2-3b-preview - model_type: llm -- metadata: {} - model_id: groq/llama-4-scout-17b-16e-instruct - provider_id: groq - provider_model_id: groq/llama-4-scout-17b-16e-instruct - 
model_type: llm -- metadata: {} - model_id: groq/meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: groq - provider_model_id: groq/llama-4-scout-17b-16e-instruct - model_type: llm -- metadata: {} - model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct - provider_id: groq - provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct - model_type: llm -- metadata: {} - model_id: groq/meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: groq - provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct - model_type: llm -- metadata: {} - model_id: groq/llama-4-maverick-17b-128e-instruct - provider_id: groq - provider_model_id: groq/llama-4-maverick-17b-128e-instruct - model_type: llm -- metadata: {} - model_id: groq/meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: groq - provider_model_id: groq/llama-4-maverick-17b-128e-instruct - model_type: llm -- metadata: {} - model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct - provider_id: groq - provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct - model_type: llm -- metadata: {} - model_id: groq/meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: groq - provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: [] -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/hf-endpoint/__init__.py b/llama_stack/templates/hf-endpoint/__init__.py deleted file mode 100644 index f2c00e3bf..000000000 --- a/llama_stack/templates/hf-endpoint/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .hf_endpoint import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/hf-endpoint/build.yaml b/llama_stack/templates/hf-endpoint/build.yaml deleted file mode 100644 index 9fca9ac22..000000000 --- a/llama_stack/templates/hf-endpoint/build.yaml +++ /dev/null @@ -1,34 +0,0 @@ -version: 2 -distribution_spec: - description: Use (an external) Hugging Face Inference Endpoint for running LLM inference - providers: - inference: - - remote::hf::endpoint - vector_io: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime - - remote::model-context-protocol -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/hf-endpoint/hf_endpoint.py b/llama_stack/templates/hf-endpoint/hf_endpoint.py deleted file mode 100644 index 23887469f..000000000 --- a/llama_stack/templates/hf-endpoint/hf_endpoint.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ShieldInput, - ToolGroupInput, -) -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig -from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::hf::endpoint"], - "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::rag-runtime", - "remote::model-context-protocol", - ], - } - name = "hf-endpoint" - inference_provider = Provider( - provider_id="hf-endpoint", - provider_type="remote::hf::endpoint", - config=InferenceEndpointImplConfig.sample_run_config(), - ) - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - vector_io_provider = Provider( - provider_id="faiss", - provider_type="inline::faiss", - config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - - inference_model = ModelInput( - model_id="${env.INFERENCE_MODEL}", - provider_id="hf-endpoint", - ) - safety_model = ModelInput( - model_id="${env.SAFETY_MODEL}", - provider_id="hf-endpoint-safety", - ) - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id="sentence-transformers", - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ] - - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Use (an external) Hugging Face Inference Endpoint for running LLM inference", - container_image=None, - template_path=None, - providers=providers, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider, embedding_provider], - "vector_io": [vector_io_provider], - }, - default_models=[inference_model, embedding_model], - default_tool_groups=default_tool_groups, - ), - "run-with-safety.yaml": RunConfigSettings( - provider_overrides={ - "inference": [ - inference_provider, - embedding_provider, - Provider( - provider_id="hf-endpoint-safety", - provider_type="remote::hf::endpoint", - config=InferenceEndpointImplConfig.sample_run_config( - endpoint_name="${env.SAFETY_INFERENCE_ENDPOINT_NAME}", - ), - ), - ], - "vector_io": [vector_io_provider], - }, - default_models=[ - inference_model, - safety_model, - embedding_model, - ], - default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], - default_tool_groups=default_tool_groups, - ), - }, - 
run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - "HF_API_TOKEN": ( - "hf_...", - "Hugging Face API token", - ), - "INFERENCE_ENDPOINT_NAME": ( - "", - "HF Inference endpoint name for the main inference model", - ), - "SAFETY_INFERENCE_ENDPOINT_NAME": ( - "", - "HF Inference endpoint for the safety model", - ), - "INFERENCE_MODEL": ( - "meta-llama/Llama-3.2-3B-Instruct", - "Inference model served by the HF Inference Endpoint", - ), - "SAFETY_MODEL": ( - "meta-llama/Llama-Guard-3-1B", - "Safety model served by the HF Inference Endpoint", - ), - }, - ) diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml deleted file mode 100644 index 2ae1d7685..000000000 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ /dev/null @@ -1,142 +0,0 @@ -version: 2 -image_name: hf-endpoint -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: hf-endpoint - provider_type: remote::hf::endpoint - config: - endpoint_name: ${env.INFERENCE_ENDPOINT_NAME} - api_token: ${env.HF_API_TOKEN} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - - provider_id: hf-endpoint-safety - provider_type: remote::hf::endpoint - config: - endpoint_name: ${env.SAFETY_INFERENCE_ENDPOINT_NAME} - api_token: ${env.HF_API_TOKEN} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - 
provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/inference_store.db -models: -- metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: hf-endpoint - model_type: llm -- metadata: {} - model_id: ${env.SAFETY_MODEL} - provider_id: hf-endpoint-safety - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: -- shield_id: ${env.SAFETY_MODEL} -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml deleted file mode 100644 index 3ec5ae9c1..000000000 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ /dev/null @@ -1,132 +0,0 @@ -version: 2 -image_name: hf-endpoint -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: hf-endpoint - provider_type: remote::hf::endpoint - config: - endpoint_name: ${env.INFERENCE_ENDPOINT_NAME} - api_token: ${env.HF_API_TOKEN} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - 
provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-endpoint}/inference_store.db -models: -- metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: hf-endpoint - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: [] -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/hf-serverless/__init__.py b/llama_stack/templates/hf-serverless/__init__.py deleted file mode 100644 index a5f1ab54a..000000000 --- a/llama_stack/templates/hf-serverless/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .hf_serverless import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/hf-serverless/build.yaml b/llama_stack/templates/hf-serverless/build.yaml deleted file mode 100644 index 214245116..000000000 --- a/llama_stack/templates/hf-serverless/build.yaml +++ /dev/null @@ -1,35 +0,0 @@ -version: 2 -distribution_spec: - description: Use (an external) Hugging Face Inference Endpoint for running LLM inference - providers: - inference: - - remote::hf::serverless - - inline::sentence-transformers - vector_io: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime - - remote::model-context-protocol -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/hf-serverless/hf_serverless.py b/llama_stack/templates/hf-serverless/hf_serverless.py deleted file mode 100644 index c58c0921d..000000000 --- a/llama_stack/templates/hf-serverless/hf_serverless.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ShieldInput, - ToolGroupInput, -) -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig -from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::hf::serverless", "inline::sentence-transformers"], - "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::rag-runtime", - "remote::model-context-protocol", - ], - } - - name = "hf-serverless" - inference_provider = Provider( - provider_id="hf-serverless", - provider_type="remote::hf::serverless", - config=InferenceAPIImplConfig.sample_run_config(), - ) - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - vector_io_provider = Provider( - provider_id="faiss", - provider_type="inline::faiss", - config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - - inference_model = ModelInput( - model_id="${env.INFERENCE_MODEL}", - provider_id="hf-serverless", - ) - safety_model = ModelInput( - model_id="${env.SAFETY_MODEL}", - provider_id="hf-serverless-safety", - ) - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id="sentence-transformers", - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ] - - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Use (an external) Hugging Face Inference Endpoint for running LLM inference", - container_image=None, - template_path=None, - providers=providers, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider, embedding_provider], - "vector_io": [vector_io_provider], - }, - default_models=[inference_model, embedding_model], - default_tool_groups=default_tool_groups, - ), - "run-with-safety.yaml": RunConfigSettings( - provider_overrides={ - "inference": [ - inference_provider, - embedding_provider, - Provider( - provider_id="hf-serverless-safety", - provider_type="remote::hf::serverless", - config=InferenceAPIImplConfig.sample_run_config( - repo="${env.SAFETY_MODEL}", - ), - ), - ], - "vector_io": [vector_io_provider], - }, - default_models=[ - inference_model, - safety_model, - embedding_model, - ], - default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], - default_tool_groups=default_tool_groups, - ), - }, - run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - "HF_API_TOKEN": ( - 
"hf_...", - "Hugging Face API token", - ), - "INFERENCE_MODEL": ( - "meta-llama/Llama-3.2-3B-Instruct", - "Inference model to be served by the HF Serverless endpoint", - ), - "SAFETY_MODEL": ( - "meta-llama/Llama-Guard-3-1B", - "Safety model to be served by the HF Serverless endpoint", - ), - }, - ) diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml deleted file mode 100644 index 3871b77e7..000000000 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ /dev/null @@ -1,142 +0,0 @@ -version: 2 -image_name: hf-serverless -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: hf-serverless - provider_type: remote::hf::serverless - config: - huggingface_repo: ${env.INFERENCE_MODEL} - api_token: ${env.HF_API_TOKEN} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - - provider_id: hf-serverless-safety - provider_type: remote::hf::serverless - config: - huggingface_repo: ${env.SAFETY_MODEL} - api_token: ${env.HF_API_TOKEN} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: 
{} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/inference_store.db -models: -- metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: hf-serverless - model_type: llm -- metadata: {} - model_id: ${env.SAFETY_MODEL} - provider_id: hf-serverless-safety - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: -- shield_id: ${env.SAFETY_MODEL} -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml deleted file mode 100644 index 0a5b59400..000000000 --- a/llama_stack/templates/hf-serverless/run.yaml +++ /dev/null @@ -1,132 +0,0 @@ -version: 2 -image_name: hf-serverless -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: hf-serverless - provider_type: remote::hf::serverless - config: - huggingface_repo: ${env.INFERENCE_MODEL} - api_token: ${env.HF_API_TOKEN} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: 
${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/hf-serverless}/inference_store.db -models: -- metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: hf-serverless - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: [] -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/llama_api/__init__.py b/llama_stack/templates/llama_api/__init__.py deleted file mode 100644 index 57cc75730..000000000 --- a/llama_stack/templates/llama_api/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .llama_api import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/llama_api/build.yaml b/llama_stack/templates/llama_api/build.yaml deleted file mode 100644 index 44a42594a..000000000 --- a/llama_stack/templates/llama_api/build.yaml +++ /dev/null @@ -1,35 +0,0 @@ -version: 2 -distribution_spec: - description: Distribution for running e2e tests in CI - providers: - inference: - - remote::llama-openai-compat - - inline::sentence-transformers - vector_io: - - inline::sqlite-vec - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime - - remote::model-context-protocol -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/llama_api/llama_api.py b/llama_stack/templates/llama_api/llama_api.py deleted file mode 100644 index 7631781af..000000000 --- a/llama_stack/templates/llama_api/llama_api.py +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- - -from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ShieldInput, - ToolGroupInput, -) -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.inline.vector_io.sqlite_vec.config import ( - SQLiteVectorIOConfig, -) -from llama_stack.providers.remote.inference.llama_openai_compat.config import ( - LlamaCompatConfig, -) -from llama_stack.providers.remote.inference.llama_openai_compat.models import ( - MODEL_ENTRIES as LLLAMA_MODEL_ENTRIES, -) -from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig -from llama_stack.providers.remote.vector_io.pgvector.config import ( - PGVectorVectorIOConfig, -) -from llama_stack.templates.template import ( - DistributionTemplate, - RunConfigSettings, - get_model_registry, -) - - -def get_inference_providers() -> tuple[list[Provider], list[ModelInput]]: - # in this template, we allow each API key to be optional - providers = [ - ( - "llama-openai-compat", - LLLAMA_MODEL_ENTRIES, - LlamaCompatConfig.sample_run_config(api_key="${env.LLAMA_API_KEY:+}"), - ), - ] - inference_providers = [] - available_models = {} - for provider_id, model_entries, config in providers: - inference_providers.append( - Provider( - provider_id=provider_id, - provider_type=f"remote::{provider_id}", - config=config, - ) - ) - available_models[provider_id] = model_entries - return inference_providers, available_models - - -def get_distribution_template() -> DistributionTemplate: - inference_providers, available_models = get_inference_providers() - providers = { - "inference": ([p.provider_type for p in inference_providers] + ["inline::sentence-transformers"]), - "vector_io": ["inline::sqlite-vec", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::rag-runtime", - "remote::model-context-protocol", - ], - } - name = "llama_api" - - vector_io_providers = [ - Provider( - provider_id="sqlite-vec", - provider_type="inline::sqlite-vec", - config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ), - Provider( - provider_id="${env.ENABLE_CHROMADB:+chromadb}", - provider_type="remote::chromadb", - config=ChromaVectorIOConfig.sample_run_config(url="${env.CHROMADB_URL:+}"), - ), - Provider( - provider_id="${env.ENABLE_PGVECTOR:+pgvector}", - provider_type="remote::pgvector", - config=PGVectorVectorIOConfig.sample_run_config( - db="${env.PGVECTOR_DB:+}", - user="${env.PGVECTOR_USER:+}", - password="${env.PGVECTOR_PASSWORD:+}", - ), - ), - ] - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ] - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id=embedding_provider.provider_id, - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - 
) - - default_models = get_model_registry(available_models) - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Distribution for running e2e tests in CI", - container_image=None, - template_path=None, - providers=providers, - available_models_by_provider=available_models, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": inference_providers + [embedding_provider], - "vector_io": vector_io_providers, - }, - default_models=default_models + [embedding_model], - default_tool_groups=default_tool_groups, - default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], - ), - }, - run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - }, - ) diff --git a/llama_stack/templates/llama_api/run.yaml b/llama_stack/templates/llama_api/run.yaml deleted file mode 100644 index b627ed2f1..000000000 --- a/llama_stack/templates/llama_api/run.yaml +++ /dev/null @@ -1,168 +0,0 @@ -version: 2 -image_name: llama_api -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: llama-openai-compat - provider_type: remote::llama-openai-compat - config: - openai_compat_api_base: https://api.llama.com/compat/v1/ - api_key: ${env.LLAMA_API_KEY:+} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: sqlite-vec - provider_type: inline::sqlite-vec - config: - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/sqlite_vec.db - - provider_id: ${env.ENABLE_CHROMADB:+chromadb} - provider_type: remote::chromadb - config: - url: ${env.CHROMADB_URL:+} - - provider_id: ${env.ENABLE_PGVECTOR:+pgvector} - provider_type: remote::pgvector - config: - host: ${env.PGVECTOR_HOST:=localhost} - port: ${env.PGVECTOR_PORT:=5432} - db: ${env.PGVECTOR_DB:+} - user: ${env.PGVECTOR_USER:+} - password: ${env.PGVECTOR_PASSWORD:+} - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - 
provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/llama_api}/inference_store.db -models: -- metadata: {} - model_id: Llama-3.3-70B-Instruct - provider_id: llama-openai-compat - provider_model_id: Llama-3.3-70B-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: llama-openai-compat - provider_model_id: Llama-3.3-70B-Instruct - model_type: llm -- metadata: {} - model_id: Llama-4-Scout-17B-16E-Instruct-FP8 - provider_id: llama-openai-compat - provider_model_id: Llama-4-Scout-17B-16E-Instruct-FP8 - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: llama-openai-compat - provider_model_id: Llama-4-Scout-17B-16E-Instruct-FP8 - model_type: llm -- metadata: {} - model_id: Llama-4-Maverick-17B-128E-Instruct-FP8 - provider_id: llama-openai-compat - provider_model_id: Llama-4-Maverick-17B-128E-Instruct-FP8 - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: llama-openai-compat - provider_model_id: Llama-4-Maverick-17B-128E-Instruct-FP8 - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: -- shield_id: meta-llama/Llama-Guard-3-8B -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/nvidia/__init__.py b/llama_stack/templates/nvidia/__init__.py deleted file mode 100644 index 24e2fbd21..000000000 --- a/llama_stack/templates/nvidia/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from .nvidia import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/nvidia/build.yaml b/llama_stack/templates/nvidia/build.yaml deleted file mode 100644 index 51685b2e3..000000000 --- a/llama_stack/templates/nvidia/build.yaml +++ /dev/null @@ -1,29 +0,0 @@ -version: 2 -distribution_spec: - description: Use NVIDIA NIM for running LLM inference, evaluation and safety - providers: - inference: - - remote::nvidia - vector_io: - - inline::faiss - safety: - - remote::nvidia - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - remote::nvidia - post_training: - - remote::nvidia - datasetio: - - inline::localfs - - remote::nvidia - scoring: - - inline::basic - tool_runtime: - - inline::rag-runtime -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/nvidia/doc_template.md b/llama_stack/templates/nvidia/doc_template.md deleted file mode 100644 index 3cb8245df..000000000 --- a/llama_stack/templates/nvidia/doc_template.md +++ /dev/null @@ -1,149 +0,0 @@ -# NVIDIA Distribution - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. - -{{ providers_table }} - -{% if run_config_env_vars %} -### Environment Variables - -The following environment variables can be configured: - -{% for var, (default_value, description) in run_config_env_vars.items() %} -- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) -{% endfor %} -{% endif %} - -{% if default_models %} -### Models - -The following models are available by default: - -{% for model in default_models %} -- `{{ model.model_id }} {{ model.doc_string }}` -{% endfor %} -{% endif %} - - -## Prerequisites -### NVIDIA API Keys - -Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable. - -### Deploy NeMo Microservices Platform -The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform. - -## Supported Services -Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints. - -### Inference: NVIDIA NIM -NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs: - 1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key) - 2. Self-hosted: NVIDIA NIMs that run on your own infrastructure. - -The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment. - -### Datasetio API: NeMo Data Store -The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. 
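For example, a minimal sketch of pointing the stock `HfApi` client at the Data Store (this is not part of the original template docs; the endpoint value is a placeholder, the exact path under which your deployment mounts the Hub-compatible API may differ, and auth requirements depend on how the service is deployed):

```python
# Hypothetical sketch: drive the NeMo Data Store with the standard Hugging Face
# Hub client. The endpoint is an assumed placeholder; substitute the
# NVIDIA_DATASETS_URL of your own deployment.
import os

from huggingface_hub import HfApi

datasets_url = os.environ.get("NVIDIA_DATASETS_URL", "http://nemo.test")  # assumed default
api = HfApi(endpoint=datasets_url)

# List dataset repositories known to the Data Store (assumes the service
# implements the corresponding Hub listing route).
for ds in api.list_datasets():
    print(ds.id)
```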
The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint. - -See the {repopath}`NVIDIA Datasetio docs::llama_stack/providers/remote/datasetio/nvidia/README.md` for supported features and example usage. - -### Eval API: NeMo Evaluator -The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint. - -See the {repopath}`NVIDIA Eval docs::llama_stack/providers/remote/eval/nvidia/README.md` for supported features and example usage. - -### Post-Training API: NeMo Customizer -The NeMo Customizer microservice supports fine-tuning models. You can reference {repopath}`this list of supported models::llama_stack/providers/remote/post_training/nvidia/models.py` that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint. - -See the {repopath}`NVIDIA Post-Training docs::llama_stack/providers/remote/post_training/nvidia/README.md` for supported features and example usage. - -### Safety API: NeMo Guardrails -The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint. - -See the {repopath}`NVIDIA Safety docs::llama_stack/providers/remote/safety/nvidia/README.md` for supported features and example usage. - -## Deploying models -In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`. - -Note: For improved inference speeds, we need to use NIM with `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart. -```sh -# URL to NeMo NIM Proxy service -export NEMO_URL="http://nemo.test" - -curl --location "$NEMO_URL/v1/deployment/model-deployments" \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ - "name": "llama-3.2-1b-instruct", - "namespace": "meta", - "config": { - "model": "meta/llama-3.2-1b-instruct", - "nim_deployment": { - "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct", - "image_tag": "1.8.3", - "pvc_size": "25Gi", - "gpu": 1, - "additional_envs": { - "NIM_GUIDED_DECODING_BACKEND": "fast_outlines" - } - } - } - }' -``` -This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference. - -You can also remove a deployed NIM to free up GPU resources, if needed. -```sh -export NEMO_URL="http://nemo.test" - -curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct" -``` - -## Running Llama Stack with NVIDIA - -You can do this via Conda or venv (build code), or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. 
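Before launching the container, it can help to confirm that your `NVIDIA_API_KEY` is accepted by the NIM endpoint. A small pre-flight sketch (not from the original docs; it assumes the hosted endpoint described above is OpenAI-compatible, and self-hosted NIM Proxy deployments would use their own `NVIDIA_BASE_URL`):

```python
# Hypothetical pre-flight check: list the models visible to your API key.
# The /v1/models route is assumed from the endpoint's OpenAI compatibility.
import os

import requests

base_url = os.environ.get("NVIDIA_BASE_URL", "https://integrate.api.nvidia.com")
headers = {"Authorization": f"Bearer {os.environ['NVIDIA_API_KEY']}"}

resp = requests.get(f"{base_url}/v1/models", headers=headers, timeout=10)
resp.raise_for_status()
print([m["id"] for m in resp.json().get("data", [])][:5])
```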
- -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ - llamastack/distribution-{{ name }} \ - --config /root/my-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env NVIDIA_API_KEY=$NVIDIA_API_KEY -``` - -### Via Conda - -```bash -INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct -llama stack build --template nvidia --image-type conda -llama stack run ./run.yaml \ - --port 8321 \ - --env NVIDIA_API_KEY=$NVIDIA_API_KEY \ - --env INFERENCE_MODEL=$INFERENCE_MODEL -``` - -### Via venv - -If you've set up your local development environment, you can also build the image using your local virtual environment. - -```bash -INFERENCE_MODEL=meta-llama/Llama-3.1-8b-Instruct -llama stack build --template nvidia --image-type venv -llama stack run ./run.yaml \ - --port 8321 \ - --env NVIDIA_API_KEY=$NVIDIA_API_KEY \ - --env INFERENCE_MODEL=$INFERENCE_MODEL -``` - -## Example Notebooks -For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in {repopath}`docs/notebooks/nvidia`. diff --git a/llama_stack/templates/nvidia/nvidia.py b/llama_stack/templates/nvidia/nvidia.py deleted file mode 100644 index 4eccfb25c..000000000 --- a/llama_stack/templates/nvidia/nvidia.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from pathlib import Path - -from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput -from llama_stack.providers.remote.datasetio.nvidia import NvidiaDatasetIOConfig -from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig -from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig -from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES -from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::nvidia"], - "vector_io": ["inline::faiss"], - "safety": ["remote::nvidia"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["remote::nvidia"], - "post_training": ["remote::nvidia"], - "datasetio": ["inline::localfs", "remote::nvidia"], - "scoring": ["inline::basic"], - "tool_runtime": ["inline::rag-runtime"], - } - - inference_provider = Provider( - provider_id="nvidia", - provider_type="remote::nvidia", - config=NVIDIAConfig.sample_run_config(), - ) - safety_provider = Provider( - provider_id="nvidia", - provider_type="remote::nvidia", - config=NVIDIASafetyConfig.sample_run_config(), - ) - datasetio_provider = Provider( - provider_id="nvidia", - provider_type="remote::nvidia", - config=NvidiaDatasetIOConfig.sample_run_config(), - ) - eval_provider = Provider( - provider_id="nvidia", - provider_type="remote::nvidia", - config=NVIDIAEvalConfig.sample_run_config(), - ) - inference_model = ModelInput( - model_id="${env.INFERENCE_MODEL}", - provider_id="nvidia", - ) - safety_model = ModelInput( - model_id="${env.SAFETY_MODEL}", - provider_id="nvidia", - ) - - available_models = { - "nvidia": MODEL_ENTRIES, - } - default_tool_groups = [ - ToolGroupInput( - 
toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ] - - default_models = get_model_registry(available_models) - return DistributionTemplate( - name="nvidia", - distro_type="self_hosted", - description="Use NVIDIA NIM for running LLM inference, evaluation and safety", - container_image=None, - template_path=Path(__file__).parent / "doc_template.md", - providers=providers, - available_models_by_provider=available_models, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider], - "datasetio": [datasetio_provider], - "eval": [eval_provider], - }, - default_models=default_models, - default_tool_groups=default_tool_groups, - ), - "run-with-safety.yaml": RunConfigSettings( - provider_overrides={ - "inference": [ - inference_provider, - safety_provider, - ], - "eval": [eval_provider], - }, - default_models=[inference_model, safety_model], - default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}", provider_id="nvidia")], - default_tool_groups=default_tool_groups, - ), - }, - run_config_env_vars={ - "NVIDIA_API_KEY": ( - "", - "NVIDIA API Key", - ), - "NVIDIA_APPEND_API_VERSION": ( - "True", - "Whether to append the API version to the base_url", - ), - ## Nemo Customizer related variables - "NVIDIA_DATASET_NAMESPACE": ( - "default", - "NVIDIA Dataset Namespace", - ), - "NVIDIA_PROJECT_ID": ( - "test-project", - "NVIDIA Project ID", - ), - "NVIDIA_CUSTOMIZER_URL": ( - "https://customizer.api.nvidia.com", - "NVIDIA Customizer URL", - ), - "NVIDIA_OUTPUT_MODEL_DIR": ( - "test-example-model@v1", - "NVIDIA Output Model Directory", - ), - "GUARDRAILS_SERVICE_URL": ( - "http://0.0.0.0:7331", - "URL for the NeMo Guardrails Service", - ), - "NVIDIA_GUARDRAILS_CONFIG_ID": ( - "self-check", - "NVIDIA Guardrail Configuration ID", - ), - "NVIDIA_EVALUATOR_URL": ( - "http://0.0.0.0:7331", - "URL for the NeMo Evaluator Service", - ), - "INFERENCE_MODEL": ( - "Llama3.1-8B-Instruct", - "Inference model", - ), - "SAFETY_MODEL": ( - "meta/llama-3.1-8b-instruct", - "Name of the model to use for safety", - ), - }, - ) diff --git a/llama_stack/templates/nvidia/run-with-safety.yaml b/llama_stack/templates/nvidia/run-with-safety.yaml deleted file mode 100644 index 875fccc9d..000000000 --- a/llama_stack/templates/nvidia/run-with-safety.yaml +++ /dev/null @@ -1,121 +0,0 @@ -version: 2 -image_name: nvidia -apis: -- agents -- datasetio -- eval -- inference -- post_training -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: nvidia - provider_type: remote::nvidia - config: - url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com} - api_key: ${env.NVIDIA_API_KEY:+} - append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True} - - provider_id: nvidia - provider_type: remote::nvidia - config: - guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:=http://localhost:7331} - config_id: ${env.NVIDIA_GUARDRAILS_CONFIG_ID:=self-check} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/faiss_store.db - safety: - - provider_id: nvidia - provider_type: remote::nvidia - config: - guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:=http://localhost:7331} - config_id: ${env.NVIDIA_GUARDRAILS_CONFIG_ID:=self-check} - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: 
${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/trace_store.db - eval: - - provider_id: nvidia - provider_type: remote::nvidia - config: - evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331} - post_training: - - provider_id: nvidia - provider_type: remote::nvidia - config: - api_key: ${env.NVIDIA_API_KEY:+} - dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default} - project_id: ${env.NVIDIA_PROJECT_ID:=test-project} - customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test} - datasetio: - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/localfs_datasetio.db - - provider_id: nvidia - provider_type: remote::nvidia - config: - api_key: ${env.NVIDIA_API_KEY:+} - dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default} - project_id: ${env.NVIDIA_PROJECT_ID:=test-project} - datasets_url: ${env.NVIDIA_DATASETS_URL:=http://nemo.test} - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - tool_runtime: - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/inference_store.db -models: -- metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: nvidia - model_type: llm -- metadata: {} - model_id: ${env.SAFETY_MODEL} - provider_id: nvidia - model_type: llm -shields: -- shield_id: ${env.SAFETY_MODEL} - provider_id: nvidia -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml deleted file mode 100644 index 4477d5244..000000000 --- a/llama_stack/templates/nvidia/run.yaml +++ /dev/null @@ -1,227 +0,0 @@ -version: 2 -image_name: nvidia -apis: -- agents -- datasetio -- eval -- inference -- post_training -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: nvidia - provider_type: remote::nvidia - config: - url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com} - api_key: ${env.NVIDIA_API_KEY:+} - append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/faiss_store.db - safety: - - provider_id: nvidia - provider_type: remote::nvidia - config: - guardrails_service_url: ${env.GUARDRAILS_SERVICE_URL:=http://localhost:7331} - config_id: ${env.NVIDIA_GUARDRAILS_CONFIG_ID:=self-check} - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/agents_store.db - responses_store: - type: sqlite - db_path: 
${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/trace_store.db - eval: - - provider_id: nvidia - provider_type: remote::nvidia - config: - evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331} - post_training: - - provider_id: nvidia - provider_type: remote::nvidia - config: - api_key: ${env.NVIDIA_API_KEY:+} - dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default} - project_id: ${env.NVIDIA_PROJECT_ID:=test-project} - customizer_url: ${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test} - datasetio: - - provider_id: nvidia - provider_type: remote::nvidia - config: - api_key: ${env.NVIDIA_API_KEY:+} - dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default} - project_id: ${env.NVIDIA_PROJECT_ID:=test-project} - datasets_url: ${env.NVIDIA_DATASETS_URL:=http://nemo.test} - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - tool_runtime: - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/nvidia}/inference_store.db -models: -- metadata: {} - model_id: meta/llama3-8b-instruct - provider_id: nvidia - provider_model_id: meta/llama3-8b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3-8B-Instruct - provider_id: nvidia - provider_model_id: meta/llama3-8b-instruct - model_type: llm -- metadata: {} - model_id: meta/llama3-70b-instruct - provider_id: nvidia - provider_model_id: meta/llama3-70b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3-70B-Instruct - provider_id: nvidia - provider_model_id: meta/llama3-70b-instruct - model_type: llm -- metadata: {} - model_id: meta/llama-3.1-8b-instruct - provider_id: nvidia - provider_model_id: meta/llama-3.1-8b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: nvidia - provider_model_id: meta/llama-3.1-8b-instruct - model_type: llm -- metadata: {} - model_id: meta/llama-3.1-70b-instruct - provider_id: nvidia - provider_model_id: meta/llama-3.1-70b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-70B-Instruct - provider_id: nvidia - provider_model_id: meta/llama-3.1-70b-instruct - model_type: llm -- metadata: {} - model_id: meta/llama-3.1-405b-instruct - provider_id: nvidia - provider_model_id: meta/llama-3.1-405b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: nvidia - provider_model_id: meta/llama-3.1-405b-instruct - model_type: llm -- metadata: {} - model_id: meta/llama-3.2-1b-instruct - provider_id: nvidia - provider_model_id: meta/llama-3.2-1b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-1B-Instruct - provider_id: nvidia - provider_model_id: meta/llama-3.2-1b-instruct - model_type: llm -- metadata: {} - model_id: meta/llama-3.2-3b-instruct - provider_id: nvidia - provider_model_id: meta/llama-3.2-3b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct - provider_id: nvidia - provider_model_id: meta/llama-3.2-3b-instruct - model_type: llm -- metadata: {} 
- model_id: meta/llama-3.2-11b-vision-instruct - provider_id: nvidia - provider_model_id: meta/llama-3.2-11b-vision-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: nvidia - provider_model_id: meta/llama-3.2-11b-vision-instruct - model_type: llm -- metadata: {} - model_id: meta/llama-3.2-90b-vision-instruct - provider_id: nvidia - provider_model_id: meta/llama-3.2-90b-vision-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: nvidia - provider_model_id: meta/llama-3.2-90b-vision-instruct - model_type: llm -- metadata: {} - model_id: meta/llama-3.3-70b-instruct - provider_id: nvidia - provider_model_id: meta/llama-3.3-70b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: nvidia - provider_model_id: meta/llama-3.3-70b-instruct - model_type: llm -- metadata: - embedding_dimension: 2048 - context_length: 8192 - model_id: nvidia/llama-3.2-nv-embedqa-1b-v2 - provider_id: nvidia - provider_model_id: nvidia/llama-3.2-nv-embedqa-1b-v2 - model_type: embedding -- metadata: - embedding_dimension: 1024 - context_length: 512 - model_id: nvidia/nv-embedqa-e5-v5 - provider_id: nvidia - provider_model_id: nvidia/nv-embedqa-e5-v5 - model_type: embedding -- metadata: - embedding_dimension: 4096 - context_length: 512 - model_id: nvidia/nv-embedqa-mistral-7b-v2 - provider_id: nvidia - provider_model_id: nvidia/nv-embedqa-mistral-7b-v2 - model_type: embedding -- metadata: - embedding_dimension: 1024 - context_length: 512 - model_id: snowflake/arctic-embed-l - provider_id: nvidia - provider_model_id: snowflake/arctic-embed-l - model_type: embedding -shields: [] -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/ollama/__init__.py b/llama_stack/templates/ollama/__init__.py deleted file mode 100644 index 3a2c40f27..000000000 --- a/llama_stack/templates/ollama/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from .ollama import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml deleted file mode 100644 index cbf4281a2..000000000 --- a/llama_stack/templates/ollama/build.yaml +++ /dev/null @@ -1,39 +0,0 @@ -version: 2 -distribution_spec: - description: Use (an external) Ollama server for running LLM inference - providers: - inference: - - remote::ollama - vector_io: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - files: - - inline::localfs - post_training: - - inline::huggingface - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime - - remote::model-context-protocol - - remote::wolfram-alpha -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md deleted file mode 100644 index aaa65bab2..000000000 --- a/llama_stack/templates/ollama/doc_template.md +++ /dev/null @@ -1,152 +0,0 @@ ---- -orphan: true ---- -# Ollama Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. - -{{ providers_table }} - -You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration. - -{% if run_config_env_vars %} -### Environment Variables - -The following environment variables can be configured: - -{% for var, (default_value, description) in run_config_env_vars.items() %} -- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) -{% endfor %} -{% endif %} - - -## Setting up Ollama server - -Please check the [Ollama Documentation](https://github.com/ollama/ollama) on how to install and run Ollama. After installing Ollama, you need to run `ollama serve` to start the server. - -In order to load models, you can run: - -```bash -export INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct" - -# ollama names this model differently, and we must use the ollama name when loading the model -export OLLAMA_INFERENCE_MODEL="llama3.2:3b-instruct-fp16" -ollama run $OLLAMA_INFERENCE_MODEL --keepalive 60m -``` - -If you are using Llama Stack Safety / Shield APIs, you will also need to pull and run the safety model. - -```bash -export SAFETY_MODEL="meta-llama/Llama-Guard-3-1B" - -# ollama names this model differently, and we must use the ollama name when loading the model -export OLLAMA_SAFETY_MODEL="llama-guard3:1b" -ollama run $OLLAMA_SAFETY_MODEL --keepalive 60m -``` - -## Running Llama Stack - -Now you are ready to run Llama Stack with Ollama as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. 
- -```bash -export LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ~/.llama:/root/.llama \ - llamastack/distribution-{{ name }} \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env OLLAMA_URL=http://host.docker.internal:11434 -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -# You need a local checkout of llama-stack to run this, get it using -# git clone https://github.com/meta-llama/llama-stack.git -cd /path/to/llama-stack - -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ~/.llama:/root/.llama \ - -v ./llama_stack/templates/ollama/run-with-safety.yaml:/root/my-run.yaml \ - llamastack/distribution-{{ name }} \ - --config /root/my-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env OLLAMA_URL=http://host.docker.internal:11434 -``` - -### Via Conda - -Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available. - -```bash -export LLAMA_STACK_PORT=8321 - -llama stack build --template {{ name }} --image-type conda -llama stack run ./run.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env OLLAMA_URL=http://localhost:11434 -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -llama stack run ./run-with-safety.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env OLLAMA_URL=http://localhost:11434 -``` - - -### (Optional) Update Model Serving Configuration - -```{note} -Please check the [model_entries](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/models.py) for the supported Ollama models. -``` - -To serve a new model with `ollama` -```bash -ollama run -``` - -To make sure that the model is being served correctly, run `ollama ps` to get a list of models being served by ollama. -``` -$ ollama ps -NAME ID SIZE PROCESSOR UNTIL -llama3.2:3b-instruct-fp16 195a8c01d91e 8.6 GB 100% GPU 9 minutes from now -``` - -To verify that the model served by ollama is correctly connected to Llama Stack server -```bash -$ llama-stack-client models list - -Available Models - -┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━┓ -┃ model_type ┃ identifier ┃ provider_resource_id ┃ metadata ┃ provider_id ┃ -┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━┩ -│ llm │ meta-llama/Llama-3.2-3B-Instruct │ llama3.2:3b-instruct-fp16 │ │ ollama │ -└──────────────┴──────────────────────────────────────┴──────────────────────────────┴───────────┴─────────────┘ - -Total models: 1 -``` diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py deleted file mode 100644 index cba25296b..000000000 --- a/llama_stack/templates/ollama/ollama.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from pathlib import Path - -from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ShieldInput, - ToolGroupInput, -) -from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig -from llama_stack.providers.inline.post_training.huggingface import HuggingFacePostTrainingConfig -from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig -from llama_stack.providers.remote.inference.ollama import OllamaImplConfig -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::ollama"], - "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "files": ["inline::localfs"], - "post_training": ["inline::huggingface"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::rag-runtime", - "remote::model-context-protocol", - "remote::wolfram-alpha", - ], - } - name = "ollama" - inference_provider = Provider( - provider_id="ollama", - provider_type="remote::ollama", - config=OllamaImplConfig.sample_run_config(), - ) - vector_io_provider_faiss = Provider( - provider_id="faiss", - provider_type="inline::faiss", - config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - files_provider = Provider( - provider_id="meta-reference-files", - provider_type="inline::localfs", - config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - posttraining_provider = Provider( - provider_id="huggingface", - provider_type="inline::huggingface", - config=HuggingFacePostTrainingConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - inference_model = ModelInput( - model_id="${env.INFERENCE_MODEL}", - provider_id="ollama", - ) - safety_model = ModelInput( - model_id="${env.SAFETY_MODEL}", - provider_id="ollama", - ) - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id="ollama", - provider_model_id="all-minilm:latest", - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ToolGroupInput( - toolgroup_id="builtin::wolfram_alpha", - provider_id="wolfram-alpha", - ), - ] - - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Use (an external) Ollama server for running LLM inference", - container_image=None, - template_path=Path(__file__).parent / "doc_template.md", - providers=providers, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider], - "vector_io": [vector_io_provider_faiss], - "files": [files_provider], - "post_training": [posttraining_provider], - }, - default_models=[inference_model, embedding_model], - default_tool_groups=default_tool_groups, - ), - "run-with-safety.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider], - "vector_io": [vector_io_provider_faiss], - "files": [files_provider], - 
"post_training": [posttraining_provider], - "safety": [ - Provider( - provider_id="llama-guard", - provider_type="inline::llama-guard", - config={}, - ), - Provider( - provider_id="code-scanner", - provider_type="inline::code-scanner", - config={}, - ), - ], - }, - default_models=[ - inference_model, - safety_model, - embedding_model, - ], - default_shields=[ - ShieldInput( - shield_id="${env.SAFETY_MODEL}", - provider_id="llama-guard", - ), - ShieldInput( - shield_id="CodeScanner", - provider_id="code-scanner", - ), - ], - default_tool_groups=default_tool_groups, - ), - }, - run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - "OLLAMA_URL": ( - "http://127.0.0.1:11434", - "URL of the Ollama server", - ), - "INFERENCE_MODEL": ( - "meta-llama/Llama-3.2-3B-Instruct", - "Inference model loaded into the Ollama server", - ), - "SAFETY_MODEL": ( - "meta-llama/Llama-Guard-3-1B", - "Safety model loaded into the Ollama server", - ), - }, - ) diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml deleted file mode 100644 index 5e906a12c..000000000 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ /dev/null @@ -1,163 +0,0 @@ -version: 2 -image_name: ollama -apis: -- agents -- datasetio -- eval -- files -- inference -- post_training -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: ollama - provider_type: remote::ollama - config: - url: ${env.OLLAMA_URL:=http://localhost:11434} - raise_on_connect_error: true - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: {} - - provider_id: code-scanner - provider_type: inline::code-scanner - config: {} - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - files: - - provider_id: 
meta-reference-files - provider_type: inline::localfs - config: - storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/ollama/files} - metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/files_metadata.db - post_training: - - provider_id: huggingface - provider_type: inline::huggingface - config: - checkpoint_format: huggingface - distributed_backend: null - device: cpu - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} - - provider_id: wolfram-alpha - provider_type: remote::wolfram-alpha - config: - api_key: ${env.WOLFRAM_ALPHA_API_KEY:+} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/inference_store.db -models: -- metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: ollama - model_type: llm -- metadata: {} - model_id: ${env.SAFETY_MODEL} - provider_id: ollama - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: ollama - provider_model_id: all-minilm:latest - model_type: embedding -shields: -- shield_id: ${env.SAFETY_MODEL} - provider_id: llama-guard -- shield_id: CodeScanner - provider_id: code-scanner -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -- toolgroup_id: builtin::wolfram_alpha - provider_id: wolfram-alpha -server: - port: 8321 diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml deleted file mode 100644 index d2b4e3978..000000000 --- a/llama_stack/templates/ollama/run.yaml +++ /dev/null @@ -1,153 +0,0 @@ -version: 2 -image_name: ollama -apis: -- agents -- datasetio -- eval -- files -- inference -- post_training -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: ollama - provider_type: remote::ollama - config: - url: ${env.OLLAMA_URL:=http://localhost:11434} - raise_on_connect_error: true - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/trace_store.db - eval: - - provider_id: meta-reference - 
provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - files: - - provider_id: meta-reference-files - provider_type: inline::localfs - config: - storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/ollama/files} - metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/files_metadata.db - post_training: - - provider_id: huggingface - provider_type: inline::huggingface - config: - checkpoint_format: huggingface - distributed_backend: null - device: cpu - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} - - provider_id: wolfram-alpha - provider_type: remote::wolfram-alpha - config: - api_key: ${env.WOLFRAM_ALPHA_API_KEY:+} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/ollama}/inference_store.db -models: -- metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: ollama - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: ollama - provider_model_id: all-minilm:latest - model_type: embedding -shields: [] -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -- toolgroup_id: builtin::wolfram_alpha - provider_id: wolfram-alpha -server: - port: 8321 diff --git a/llama_stack/templates/passthrough/__init__.py b/llama_stack/templates/passthrough/__init__.py deleted file mode 100644 index 9632c09fb..000000000 --- a/llama_stack/templates/passthrough/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from .passthrough import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/passthrough/build.yaml b/llama_stack/templates/passthrough/build.yaml deleted file mode 100644 index e2e041dbc..000000000 --- a/llama_stack/templates/passthrough/build.yaml +++ /dev/null @@ -1,36 +0,0 @@ -version: 2 -distribution_spec: - description: Use Passthrough hosted llama-stack endpoint for LLM inference - providers: - inference: - - remote::passthrough - - inline::sentence-transformers - vector_io: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - tool_runtime: - - remote::brave-search - - remote::tavily-search - - remote::wolfram-alpha - - inline::rag-runtime - - remote::model-context-protocol -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/passthrough/doc_template.md b/llama_stack/templates/passthrough/doc_template.md deleted file mode 100644 index f9e88873d..000000000 --- a/llama_stack/templates/passthrough/doc_template.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -orphan: true ---- -# Passthrough Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. - -{{ providers_table }} - -{% if run_config_env_vars %} -### Environment Variables - -The following environment variables can be configured: - -{% for var, (default_value, description) in run_config_env_vars.items() %} -- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) -{% endfor %} -{% endif %} - -{% if default_models %} -### Models - -The following models are available by default: - -{% for model in default_models %} -- `{{ model.model_id }} {{ model.doc_string }}` -{% endfor %} -{% endif %} diff --git a/llama_stack/templates/passthrough/passthrough.py b/llama_stack/templates/passthrough/passthrough.py deleted file mode 100644 index 1b94a9aae..000000000 --- a/llama_stack/templates/passthrough/passthrough.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from pathlib import Path - -from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ShieldInput, - ToolGroupInput, -) -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig -from llama_stack.providers.remote.inference.passthrough.config import ( - PassthroughImplConfig, -) -from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::passthrough", "inline::sentence-transformers"], - "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "remote::wolfram-alpha", - "inline::rag-runtime", - "remote::model-context-protocol", - ], - } - - name = "passthrough" - - inference_provider = Provider( - provider_id="passthrough", - provider_type="remote::passthrough", - config=PassthroughImplConfig.sample_run_config(), - ) - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - vector_io_provider = Provider( - provider_id="faiss", - provider_type="inline::faiss", - config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - - default_models = [ - ModelInput( - metadata={}, - model_id="meta-llama/Llama-3.1-8B-Instruct", - provider_id="passthrough", - provider_model_id="llama3.1-8b-instruct", - model_type=ModelType.llm, - ), - ModelInput( - metadata={}, - model_id="meta-llama/Llama-3.2-11B-Vision-Instruct", - provider_id="passthrough", - provider_model_id="llama3.2-11b-vision-instruct", - model_type=ModelType.llm, - ), - ] - - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id="sentence-transformers", - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::wolfram_alpha", - provider_id="wolfram-alpha", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ] - - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Use Passthrough hosted llama-stack endpoint for LLM inference", - container_image=None, - template_path=Path(__file__).parent / "doc_template.md", - providers=providers, - available_models_by_provider={ - "passthrough": [ - ProviderModelEntry( - provider_model_id="llama3.1-8b-instruct", - model_type=ModelType.llm, - ), - ProviderModelEntry( - provider_model_id="llama3.2-11b-vision-instruct", - model_type=ModelType.llm, - ), - ], - }, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider, embedding_provider], - "vector_io": [vector_io_provider], - }, - default_models=default_models + [embedding_model], - 
default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], - default_tool_groups=default_tool_groups, - ), - "run-with-safety.yaml": RunConfigSettings( - provider_overrides={ - "inference": [ - inference_provider, - embedding_provider, - ], - "vector_io": [vector_io_provider], - "safety": [ - Provider( - provider_id="llama-guard", - provider_type="inline::llama-guard", - config={}, - ), - Provider( - provider_id="llama-guard-vision", - provider_type="inline::llama-guard", - config={}, - ), - Provider( - provider_id="code-scanner", - provider_type="inline::code-scanner", - config={}, - ), - ], - }, - default_models=[ - *default_models, - embedding_model, - ], - default_shields=[ - ShieldInput( - shield_id="meta-llama/Llama-Guard-3-8B", - provider_id="llama-guard", - ), - ShieldInput( - shield_id="meta-llama/Llama-Guard-3-11B-Vision", - provider_id="llama-guard-vision", - ), - ShieldInput( - shield_id="CodeScanner", - provider_id="code-scanner", - ), - ], - default_tool_groups=default_tool_groups, - ), - }, - run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - "PASSTHROUGH_API_KEY": ( - "", - "Passthrough API Key", - ), - "PASSTHROUGH_URL": ( - "", - "Passthrough URL", - ), - }, - ) diff --git a/llama_stack/templates/passthrough/run-with-safety.yaml b/llama_stack/templates/passthrough/run-with-safety.yaml deleted file mode 100644 index c5b047511..000000000 --- a/llama_stack/templates/passthrough/run-with-safety.yaml +++ /dev/null @@ -1,155 +0,0 @@ -version: 2 -image_name: passthrough -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: passthrough - provider_type: remote::passthrough - config: - url: ${env.PASSTHROUGH_URL} - api_key: ${env.PASSTHROUGH_API_KEY} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: {} - - provider_id: llama-guard-vision - provider_type: inline::llama-guard - config: {} - - provider_id: code-scanner - provider_type: inline::code-scanner - config: {} - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/huggingface_datasetio.db - - provider_id: localfs - 
provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: wolfram-alpha - provider_type: remote::wolfram-alpha - config: - api_key: ${env.WOLFRAM_ALPHA_API_KEY:+} - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/inference_store.db -models: -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: passthrough - provider_model_id: llama3.1-8b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: passthrough - provider_model_id: llama3.2-11b-vision-instruct - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: -- shield_id: meta-llama/Llama-Guard-3-8B - provider_id: llama-guard -- shield_id: meta-llama/Llama-Guard-3-11B-Vision - provider_id: llama-guard-vision -- shield_id: CodeScanner - provider_id: code-scanner -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::wolfram_alpha - provider_id: wolfram-alpha -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/passthrough/run.yaml b/llama_stack/templates/passthrough/run.yaml deleted file mode 100644 index 896b3c91e..000000000 --- a/llama_stack/templates/passthrough/run.yaml +++ /dev/null @@ -1,145 +0,0 @@ -version: 2 -image_name: passthrough -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: passthrough - provider_type: remote::passthrough - config: - url: ${env.PASSTHROUGH_URL} - api_key: ${env.PASSTHROUGH_API_KEY} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/agents_store.db - responses_store: - type: sqlite - db_path: 
${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: wolfram-alpha - provider_type: remote::wolfram-alpha - config: - api_key: ${env.WOLFRAM_ALPHA_API_KEY:+} - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/passthrough}/inference_store.db -models: -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: passthrough - provider_model_id: llama3.1-8b-instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: passthrough - provider_model_id: llama3.2-11b-vision-instruct - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: -- shield_id: meta-llama/Llama-Guard-3-8B -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::wolfram_alpha - provider_id: wolfram-alpha -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/remote-vllm/__init__.py b/llama_stack/templates/remote-vllm/__init__.py deleted file mode 100644 index 7b3d59a01..000000000 --- a/llama_stack/templates/remote-vllm/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from .vllm import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/remote-vllm/build.yaml b/llama_stack/templates/remote-vllm/build.yaml deleted file mode 100644 index 0298b01c7..000000000 --- a/llama_stack/templates/remote-vllm/build.yaml +++ /dev/null @@ -1,36 +0,0 @@ -version: 2 -distribution_spec: - description: Use (an external) vLLM server for running LLM inference - providers: - inference: - - remote::vllm - - inline::sentence-transformers - vector_io: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - telemetry: - - inline::meta-reference - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime - - remote::model-context-protocol - - remote::wolfram-alpha -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/remote-vllm/doc_template.md b/llama_stack/templates/remote-vllm/doc_template.md deleted file mode 100644 index 5684888da..000000000 --- a/llama_stack/templates/remote-vllm/doc_template.md +++ /dev/null @@ -1,284 +0,0 @@ ---- -orphan: true ---- -# Remote vLLM Distribution -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations: - -{{ providers_table }} - -You can use this distribution if you want to run an independent vLLM server for inference. - -{% if run_config_env_vars %} -### Environment Variables - -The following environment variables can be configured: - -{% for var, (default_value, description) in run_config_env_vars.items() %} -- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) -{% endfor %} -{% endif %} - - -## Setting up vLLM server - -In the following sections, we'll use AMD, NVIDIA or Intel GPUs to serve as hardware accelerators for the vLLM -server, which acts as both the LLM inference provider and the safety provider. Note that vLLM also -[supports many other hardware accelerators](https://docs.vllm.ai/en/latest/getting_started/installation.html) and -that we only use GPUs here for demonstration purposes. Note that if you run into issues, you can include the environment variable `--env VLLM_DEBUG_LOG_API_SERVER_RESPONSE=true` (available in vLLM v0.8.3 and above) in the `docker run` command to enable log response from API server for debugging. - -### Setting up vLLM server on AMD GPU - -AMD provides two main vLLM container options: -- rocm/vllm: Production-ready container -- rocm/vllm-dev: Development container with the latest vLLM features - -Please check the [Blog about ROCm vLLM Usage](https://rocm.blogs.amd.com/software-tools-optimization/vllm-container/README.html) to get more details. 
- -Here is a sample script to start a ROCm vLLM server locally via Docker: - -```bash -export INFERENCE_PORT=8000 -export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct -export CUDA_VISIBLE_DEVICES=0 -export VLLM_DIMG="rocm/vllm-dev:main" - -docker run \ - --pull always \ - --ipc=host \ - --privileged \ - --shm-size 16g \ - --device=/dev/kfd \ - --device=/dev/dri \ - --group-add video \ - --cap-add=SYS_PTRACE \ - --cap-add=CAP_SYS_ADMIN \ - --security-opt seccomp=unconfined \ - --security-opt apparmor=unconfined \ - --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ - --env "HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \ - -p $INFERENCE_PORT:$INFERENCE_PORT \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - $VLLM_DIMG \ - python -m vllm.entrypoints.openai.api_server \ - --model $INFERENCE_MODEL \ - --port $INFERENCE_PORT -``` - -Note that you'll also need to set `--enable-auto-tool-choice` and `--tool-call-parser` to [enable tool calling in vLLM](https://docs.vllm.ai/en/latest/features/tool_calling.html). - -If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: - -```bash -export SAFETY_PORT=8081 -export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B -export CUDA_VISIBLE_DEVICES=1 -export VLLM_DIMG="rocm/vllm-dev:main" - -docker run \ - --pull always \ - --ipc=host \ - --privileged \ - --shm-size 16g \ - --device=/dev/kfd \ - --device=/dev/dri \ - --group-add video \ - --cap-add=SYS_PTRACE \ - --cap-add=CAP_SYS_ADMIN \ - --security-opt seccomp=unconfined \ - --security-opt apparmor=unconfined \ - --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ - --env "HIP_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" \ - -p $SAFETY_PORT:$SAFETY_PORT \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - $VLLM_DIMG \ - python -m vllm.entrypoints.openai.api_server \ - --model $SAFETY_MODEL \ - --port $SAFETY_PORT -``` - -### Setting up vLLM server on NVIDIA GPU - -Please check the [vLLM Documentation](https://docs.vllm.ai/en/v0.5.5/serving/deploying_with_docker.html) to get a vLLM endpoint. Here is a sample script to start a vLLM server locally via Docker: - -```bash -export INFERENCE_PORT=8000 -export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct -export CUDA_VISIBLE_DEVICES=0 - -docker run \ - --pull always \ - --runtime nvidia \ - --gpus $CUDA_VISIBLE_DEVICES \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ - -p $INFERENCE_PORT:$INFERENCE_PORT \ - --ipc=host \ - vllm/vllm-openai:latest \ - --gpu-memory-utilization 0.7 \ - --model $INFERENCE_MODEL \ - --port $INFERENCE_PORT -``` - -Note that you'll also need to set `--enable-auto-tool-choice` and `--tool-call-parser` to [enable tool calling in vLLM](https://docs.vllm.ai/en/latest/features/tool_calling.html). 
- -If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: - -```bash -export SAFETY_PORT=8081 -export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B -export CUDA_VISIBLE_DEVICES=1 - -docker run \ - --pull always \ - --runtime nvidia \ - --gpus $CUDA_VISIBLE_DEVICES \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ - -p $SAFETY_PORT:$SAFETY_PORT \ - --ipc=host \ - vllm/vllm-openai:latest \ - --gpu-memory-utilization 0.7 \ - --model $SAFETY_MODEL \ - --port $SAFETY_PORT -``` - -### Setting up vLLM server on Intel GPU - -Refer to [vLLM Documentation for XPU](https://docs.vllm.ai/en/v0.8.2/getting_started/installation/gpu.html?device=xpu) to get a vLLM endpoint. In addition to vLLM side setup which guides towards installing vLLM from sources orself-building vLLM Docker container, Intel provides prebuilt vLLM container to use on systems with Intel GPUs supported by PyTorch XPU backend: -- [intel/vllm](https://hub.docker.com/r/intel/vllm) - -Here is a sample script to start a vLLM server locally via Docker using Intel provided container: - -```bash -export INFERENCE_PORT=8000 -export INFERENCE_MODEL=meta-llama/Llama-3.2-1B-Instruct -export ZE_AFFINITY_MASK=0 - -docker run \ - --pull always \ - --device /dev/dri \ - -v /dev/dri/by-path:/dev/dri/by-path \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ - --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \ - -p $INFERENCE_PORT:$INFERENCE_PORT \ - --ipc=host \ - intel/vllm:xpu \ - --gpu-memory-utilization 0.7 \ - --model $INFERENCE_MODEL \ - --port $INFERENCE_PORT -``` - -If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a vLLM with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: - -```bash -export SAFETY_PORT=8081 -export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B -export ZE_AFFINITY_MASK=1 - -docker run \ - --pull always \ - --device /dev/dri \ - -v /dev/dri/by-path:/dev/dri/by-path \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ - --env ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK \ - -p $SAFETY_PORT:$SAFETY_PORT \ - --ipc=host \ - intel/vllm:xpu \ - --gpu-memory-utilization 0.7 \ - --model $SAFETY_MODEL \ - --port $SAFETY_PORT -``` - -## Running Llama Stack - -Now you are ready to run Llama Stack with vLLM as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. 
- -```bash -export INFERENCE_PORT=8000 -export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct -export LLAMA_STACK_PORT=8321 - -# You need a local checkout of llama-stack to run this, get it using -# git clone https://github.com/meta-llama/llama-stack.git -cd /path/to/llama-stack - -docker run \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./llama_stack/templates/remote-vllm/run.yaml:/root/my-run.yaml \ - llamastack/distribution-{{ name }} \ - --config /root/my-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1 -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -export SAFETY_PORT=8081 -export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B - -# You need a local checkout of llama-stack to run this, get it using -# git clone https://github.com/meta-llama/llama-stack.git -cd /path/to/llama-stack - -docker run \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ~/.llama:/root/.llama \ - -v ./llama_stack/templates/remote-vllm/run-with-safety.yaml:/root/my-run.yaml \ - llamastack/distribution-{{ name }} \ - --config /root/my-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env VLLM_URL=http://host.docker.internal:$INFERENCE_PORT/v1 \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env SAFETY_VLLM_URL=http://host.docker.internal:$SAFETY_PORT/v1 -``` - - -### Via Conda - -Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available. - -```bash -export INFERENCE_PORT=8000 -export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct -export LLAMA_STACK_PORT=8321 - -cd distributions/remote-vllm -llama stack build --template remote-vllm --image-type conda - -llama stack run ./run.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env VLLM_URL=http://localhost:$INFERENCE_PORT/v1 -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -export SAFETY_PORT=8081 -export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B - -llama stack run ./run-with-safety.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env VLLM_URL=http://localhost:$INFERENCE_PORT/v1 \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env SAFETY_VLLM_URL=http://localhost:$SAFETY_PORT/v1 -``` diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml deleted file mode 100644 index e306a771b..000000000 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ /dev/null @@ -1,152 +0,0 @@ -version: 2 -image_name: remote-vllm -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: vllm-inference - provider_type: remote::vllm - config: - url: ${env.VLLM_URL:=http://localhost:8000/v1} - max_tokens: ${env.VLLM_MAX_TOKENS:=4096} - api_token: ${env.VLLM_API_TOKEN:=fake} - tls_verify: ${env.VLLM_TLS_VERIFY:=true} - - provider_id: vllm-safety - provider_type: remote::vllm - config: - url: ${env.SAFETY_VLLM_URL} - max_tokens: ${env.VLLM_MAX_TOKENS:=4096} - api_token: ${env.VLLM_API_TOKEN:=fake} - tls_verify: ${env.VLLM_TLS_VERIFY:=true} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: 
${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/responses_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/trace_store.db - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} - - provider_id: wolfram-alpha - provider_type: remote::wolfram-alpha - config: - api_key: ${env.WOLFRAM_ALPHA_API_KEY:+} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/inference_store.db -models: -- metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: vllm-inference - model_type: llm -- metadata: {} - model_id: ${env.SAFETY_MODEL} - provider_id: vllm-safety - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: -- shield_id: ${env.SAFETY_MODEL} -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -- toolgroup_id: builtin::wolfram_alpha - provider_id: wolfram-alpha -server: - port: 8321 diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml deleted file mode 100644 index 1dbef96a2..000000000 --- a/llama_stack/templates/remote-vllm/run.yaml +++ /dev/null @@ -1,140 +0,0 @@ -version: 2 -image_name: remote-vllm -apis: -- agents -- 
datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: vllm-inference - provider_type: remote::vllm - config: - url: ${env.VLLM_URL:=http://localhost:8000/v1} - max_tokens: ${env.VLLM_MAX_TOKENS:=4096} - api_token: ${env.VLLM_API_TOKEN:=fake} - tls_verify: ${env.VLLM_TLS_VERIFY:=true} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/responses_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/trace_store.db - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} - - provider_id: wolfram-alpha - provider_type: remote::wolfram-alpha - config: - api_key: ${env.WOLFRAM_ALPHA_API_KEY:+} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm}/inference_store.db -models: -- metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: vllm-inference - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: [] -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: 
builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -- toolgroup_id: builtin::wolfram_alpha - provider_id: wolfram-alpha -server: - port: 8321 diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py deleted file mode 100644 index a8e1d9a58..000000000 --- a/llama_stack/templates/remote-vllm/vllm.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from pathlib import Path - -from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ShieldInput, - ToolGroupInput, -) -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig -from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::vllm", "inline::sentence-transformers"], - "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "telemetry": ["inline::meta-reference"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::rag-runtime", - "remote::model-context-protocol", - "remote::wolfram-alpha", - ], - } - name = "remote-vllm" - inference_provider = Provider( - provider_id="vllm-inference", - provider_type="remote::vllm", - config=VLLMInferenceAdapterConfig.sample_run_config( - url="${env.VLLM_URL:=http://localhost:8000/v1}", - ), - ) - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - vector_io_provider = Provider( - provider_id="faiss", - provider_type="inline::faiss", - config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - - inference_model = ModelInput( - model_id="${env.INFERENCE_MODEL}", - provider_id="vllm-inference", - ) - safety_model = ModelInput( - model_id="${env.SAFETY_MODEL}", - provider_id="vllm-safety", - ) - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id="sentence-transformers", - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ToolGroupInput( - toolgroup_id="builtin::wolfram_alpha", - provider_id="wolfram-alpha", - ), - ] - - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Use (an external) vLLM server for running LLM inference", - template_path=Path(__file__).parent / "doc_template.md", - providers=providers, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider, embedding_provider], - "vector_io": 
[vector_io_provider], - }, - default_models=[inference_model, embedding_model], - default_tool_groups=default_tool_groups, - ), - "run-with-safety.yaml": RunConfigSettings( - provider_overrides={ - "inference": [ - inference_provider, - Provider( - provider_id="vllm-safety", - provider_type="remote::vllm", - config=VLLMInferenceAdapterConfig.sample_run_config( - url="${env.SAFETY_VLLM_URL}", - ), - ), - embedding_provider, - ], - "vector_io": [vector_io_provider], - }, - default_models=[ - inference_model, - safety_model, - embedding_model, - ], - default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], - default_tool_groups=default_tool_groups, - ), - }, - run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - "INFERENCE_MODEL": ( - "meta-llama/Llama-3.2-3B-Instruct", - "Inference model loaded into the vLLM server", - ), - "VLLM_URL": ( - "http://host.docker.internal:5100/v1", - "URL of the vLLM server with the main inference model", - ), - "MAX_TOKENS": ( - "4096", - "Maximum number of tokens for generation", - ), - "SAFETY_VLLM_URL": ( - "http://host.docker.internal:5101/v1", - "URL of the vLLM server with the safety model", - ), - "SAFETY_MODEL": ( - "meta-llama/Llama-Guard-3-1B", - "Name of the safety (Llama-Guard) model to use", - ), - }, - ) diff --git a/llama_stack/templates/sambanova/__init__.py b/llama_stack/templates/sambanova/__init__.py deleted file mode 100644 index 30209fb7f..000000000 --- a/llama_stack/templates/sambanova/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .sambanova import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/sambanova/build.yaml b/llama_stack/templates/sambanova/build.yaml deleted file mode 100644 index ba70f88c6..000000000 --- a/llama_stack/templates/sambanova/build.yaml +++ /dev/null @@ -1,27 +0,0 @@ -version: 2 -distribution_spec: - description: Use SambaNova for running LLM inference and safety - providers: - inference: - - remote::sambanova - - inline::sentence-transformers - vector_io: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: - - remote::sambanova - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime - - remote::model-context-protocol - - remote::wolfram-alpha -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/sambanova/doc_template.md b/llama_stack/templates/sambanova/doc_template.md deleted file mode 100644 index 1dc76fd3f..000000000 --- a/llama_stack/templates/sambanova/doc_template.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -orphan: true ---- -# SambaNova Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. 
- -{{ providers_table }} - -{% if run_config_env_vars %} -### Environment Variables - -The following environment variables can be configured: - -{% for var, (default_value, description) in run_config_env_vars.items() %} -- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) -{% endfor %} -{% endif %} - -{% if default_models %} -### Models - -The following models are available by default: - -{% for model in default_models %} -- `{{ model.model_id }} {{ model.doc_string }}` -{% endfor %} -{% endif %} - - -### Prerequisite: API Keys - -Make sure you have access to a SambaNova API Key. You can get one by visiting [SambaNova.ai](http://cloud.sambanova.ai?utm_source=llamastack&utm_medium=external&utm_campaign=cloud_signup). - - -## Running Llama Stack with SambaNova - -You can do this via Conda (build code) or Docker which has a pre-built image. - - -### Via Docker - -```bash -LLAMA_STACK_PORT=8321 -llama stack build --template sambanova --image-type container -docker run \ - -it \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ~/.llama:/root/.llama \ - distribution-{{ name }} \ - --port $LLAMA_STACK_PORT \ - --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY -``` - - -### Via Venv - -```bash -llama stack build --template sambanova --image-type venv -llama stack run --image-type venv ~/.llama/distributions/sambanova/sambanova-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY -``` - - -### Via Conda - -```bash -llama stack build --template sambanova --image-type conda -llama stack run --image-type conda ~/.llama/distributions/sambanova/sambanova-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env SAMBANOVA_API_KEY=$SAMBANOVA_API_KEY -``` diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml deleted file mode 100644 index b96621b58..000000000 --- a/llama_stack/templates/sambanova/run.yaml +++ /dev/null @@ -1,214 +0,0 @@ -version: 2 -image_name: sambanova -apis: -- agents -- inference -- safety -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: sambanova - provider_type: remote::sambanova - config: - url: https://api.sambanova.ai/v1 - api_key: ${env.SAMBANOVA_API_KEY} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/sambanova}/faiss_store.db - - provider_id: ${env.ENABLE_CHROMADB:+chromadb} - provider_type: remote::chromadb - config: - url: ${env.CHROMADB_URL:+} - - provider_id: ${env.ENABLE_PGVECTOR:+pgvector} - provider_type: remote::pgvector - config: - host: ${env.PGVECTOR_HOST:=localhost} - port: ${env.PGVECTOR_PORT:=5432} - db: ${env.PGVECTOR_DB:+} - user: ${env.PGVECTOR_USER:+} - password: ${env.PGVECTOR_PASSWORD:+} - safety: - - provider_id: sambanova - provider_type: remote::sambanova - config: - url: https://api.sambanova.ai/v1 - api_key: ${env.SAMBANOVA_API_KEY} - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/sambanova}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/sambanova}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - 
sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/sambanova}/trace_store.db - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} - - provider_id: wolfram-alpha - provider_type: remote::wolfram-alpha - config: - api_key: ${env.WOLFRAM_ALPHA_API_KEY:+} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/sambanova}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/sambanova}/inference_store.db -models: -- metadata: {} - model_id: sambanova/Meta-Llama-3.1-8B-Instruct - provider_id: sambanova - provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: sambanova - provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Meta-Llama-3.1-405B-Instruct - provider_id: sambanova - provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: sambanova - provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Meta-Llama-3.2-1B-Instruct - provider_id: sambanova - provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-1B-Instruct - provider_id: sambanova - provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Meta-Llama-3.2-3B-Instruct - provider_id: sambanova - provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct - provider_id: sambanova - provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Meta-Llama-3.3-70B-Instruct - provider_id: sambanova - provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: sambanova - provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Llama-3.2-11B-Vision-Instruct - provider_id: sambanova - provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: sambanova - provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Llama-3.2-90B-Vision-Instruct - provider_id: sambanova - provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: sambanova - provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Llama-4-Scout-17B-16E-Instruct - provider_id: sambanova - provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct - model_type: llm -- metadata: {} - model_id: 
meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: sambanova - provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct - provider_id: sambanova - provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: sambanova - provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct - model_type: llm -- metadata: {} - model_id: sambanova/Meta-Llama-Guard-3-8B - provider_id: sambanova - provider_model_id: sambanova/Meta-Llama-Guard-3-8B - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-8B - provider_id: sambanova - provider_model_id: sambanova/Meta-Llama-Guard-3-8B - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: -- shield_id: meta-llama/Llama-Guard-3-8B - provider_shield_id: sambanova/Meta-Llama-Guard-3-8B -- shield_id: sambanova/Meta-Llama-Guard-3-8B - provider_shield_id: sambanova/Meta-Llama-Guard-3-8B -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -- toolgroup_id: builtin::wolfram_alpha - provider_id: wolfram-alpha -server: - port: 8321 diff --git a/llama_stack/templates/sambanova/sambanova.py b/llama_stack/templates/sambanova/sambanova.py deleted file mode 100644 index 428577697..000000000 --- a/llama_stack/templates/sambanova/sambanova.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from pathlib import Path - -from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ShieldInput, - ToolGroupInput, -) -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig -from llama_stack.providers.remote.inference.sambanova import SambaNovaImplConfig -from llama_stack.providers.remote.inference.sambanova.models import MODEL_ENTRIES -from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig -from llama_stack.providers.remote.vector_io.pgvector.config import ( - PGVectorVectorIOConfig, -) -from llama_stack.templates.template import ( - DistributionTemplate, - RunConfigSettings, - get_model_registry, -) - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::sambanova", "inline::sentence-transformers"], - "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["remote::sambanova"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::rag-runtime", - "remote::model-context-protocol", - "remote::wolfram-alpha", - ], - } - name = "sambanova" - inference_provider = Provider( - provider_id=name, - provider_type=f"remote::{name}", - config=SambaNovaImplConfig.sample_run_config(), - ) - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id="sentence-transformers", - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - vector_io_providers = [ - Provider( - provider_id="faiss", - provider_type="inline::faiss", - config=FaissVectorIOConfig.sample_run_config( - __distro_dir__=f"~/.llama/distributions/{name}", - ), - ), - Provider( - provider_id="${env.ENABLE_CHROMADB:+chromadb}", - provider_type="remote::chromadb", - config=ChromaVectorIOConfig.sample_run_config(url="${env.CHROMADB_URL:+}"), - ), - Provider( - provider_id="${env.ENABLE_PGVECTOR:+pgvector}", - provider_type="remote::pgvector", - config=PGVectorVectorIOConfig.sample_run_config( - db="${env.PGVECTOR_DB:+}", - user="${env.PGVECTOR_USER:+}", - password="${env.PGVECTOR_PASSWORD:+}", - ), - ), - ] - - available_models = { - name: MODEL_ENTRIES, - } - default_models = get_model_registry(available_models) - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ToolGroupInput( - toolgroup_id="builtin::wolfram_alpha", - provider_id="wolfram-alpha", - ), - ] - - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Use SambaNova for running LLM inference and safety", - container_image=None, - template_path=Path(__file__).parent / "doc_template.md", - providers=providers, - available_models_by_provider=available_models, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider, embedding_provider], - "vector_io": vector_io_providers, - }, - default_models=default_models + [embedding_model], - default_shields=[ - ShieldInput( - 
shield_id="meta-llama/Llama-Guard-3-8B", provider_shield_id="sambanova/Meta-Llama-Guard-3-8B" - ), - ShieldInput( - shield_id="sambanova/Meta-Llama-Guard-3-8B", - provider_shield_id="sambanova/Meta-Llama-Guard-3-8B", - ), - ], - default_tool_groups=default_tool_groups, - ), - }, - run_config_env_vars={ - "LLAMASTACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - "SAMBANOVA_API_KEY": ( - "", - "SambaNova API Key", - ), - }, - ) diff --git a/llama_stack/templates/starter/build.yaml b/llama_stack/templates/starter/build.yaml index 3b48dcf7a..0f61ea91e 100644 --- a/llama_stack/templates/starter/build.yaml +++ b/llama_stack/templates/starter/build.yaml @@ -12,6 +12,14 @@ distribution_spec: - remote::groq - remote::sambanova - remote::vllm + - remote::tgi + - remote::cerebras + - remote::llama-openai-compat + - remote::nvidia + - remote::hf::serverless + - remote::hf::endpoint + - remote::bedrock + - remote::passthrough - inline::sentence-transformers vector_io: - inline::sqlite-vec @@ -25,6 +33,8 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + post_training: + - inline::huggingface eval: - inline::meta-reference datasetio: diff --git a/llama_stack/templates/starter/run.yaml b/llama_stack/templates/starter/run.yaml index 00faf029e..fbc2c829a 100644 --- a/llama_stack/templates/starter/run.yaml +++ b/llama_stack/templates/starter/run.yaml @@ -6,6 +6,7 @@ apis: - eval - files - inference +- post_training - safety - scoring - telemetry @@ -13,70 +14,107 @@ apis: - vector_io providers: inference: - - provider_id: openai + - provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_type: remote::openai config: api_key: ${env.OPENAI_API_KEY:+} - - provider_id: fireworks + - provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_type: remote::fireworks config: url: https://api.fireworks.ai/inference/v1 api_key: ${env.FIREWORKS_API_KEY:+} - - provider_id: together + - provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_type: remote::together config: url: https://api.together.xyz/v1 api_key: ${env.TOGETHER_API_KEY:+} - - provider_id: ollama + - provider_id: ${env.ENABLE_OLLAMA:=__disabled__} provider_type: remote::ollama config: url: ${env.OLLAMA_URL:=http://localhost:11434} - raise_on_connect_error: false - - provider_id: anthropic + - provider_id: ${env.ENABLE_ANTHROPIC:=__disabled__} provider_type: remote::anthropic config: api_key: ${env.ANTHROPIC_API_KEY:+} - - provider_id: gemini + - provider_id: ${env.ENABLE_GEMINI:=__disabled__} provider_type: remote::gemini config: api_key: ${env.GEMINI_API_KEY:+} - - provider_id: groq + - provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_type: remote::groq config: url: https://api.groq.com api_key: ${env.GROQ_API_KEY:+} - - provider_id: sambanova + - provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_type: remote::sambanova config: url: https://api.sambanova.ai/v1 api_key: ${env.SAMBANOVA_API_KEY:+} - - provider_id: vllm + - provider_id: ${env.ENABLE_VLLM:=__disabled__} provider_type: remote::vllm config: url: ${env.VLLM_URL:=http://localhost:8000/v1} max_tokens: ${env.VLLM_MAX_TOKENS:=4096} api_token: ${env.VLLM_API_TOKEN:=fake} tls_verify: ${env.VLLM_TLS_VERIFY:=true} - - provider_id: sentence-transformers + - provider_id: ${env.ENABLE_TGI:=__disabled__} + provider_type: remote::tgi + config: + url: ${env.TGI_URL:+} + - provider_id: ${env.ENABLE_CEREBRAS:=__disabled__} + provider_type: remote::cerebras + config: + base_url: https://api.cerebras.ai + api_key: 
${env.CEREBRAS_API_KEY:+} + - provider_id: ${env.ENABLE_LLAMA_OPENAI_COMPAT:=__disabled__} + provider_type: remote::llama-openai-compat + config: + openai_compat_api_base: https://api.llama.com/compat/v1/ + api_key: ${env.LLAMA_API_KEY:+:} + - provider_id: ${env.ENABLE_NVIDIA:=__disabled__} + provider_type: remote::nvidia + config: + url: ${env.NVIDIA_BASE_URL:__disabled__} + api_key: ${env.NVIDIA_API_KEY:+} + append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True} + - provider_id: ${env.ENABLE_HF_SERVERLESS:=__disabled__} + provider_type: remote::hf::serverless + config: + huggingface_repo: ${env.INFERENCE_MODEL:+:} + api_token: ${env.HF_API_TOKEN:+:} + - provider_id: ${env.ENABLE_HF_ENDPOINT:=__disabled__} + provider_type: remote::hf::endpoint + config: + endpoint_name: ${env.INFERENCE_ENDPOINT_NAME:+:} + api_token: ${env.HF_API_TOKEN:+:} + - provider_id: ${env.ENABLE_BEDROCK:=__disabled__} + provider_type: remote::bedrock + config: {} + - provider_id: ${env.ENABLE_PASSTHROUGH:=__disabled__} + provider_type: remote::passthrough + config: + url: ${env.PASSTHROUGH_URL:+:} + api_key: ${env.PASSTHROUGH_API_KEY:+:} + - provider_id: ${env.ENABLE_SENTENCE_TRANSFORMERS:=sentence-transformers} provider_type: inline::sentence-transformers config: {} vector_io: - - provider_id: faiss + - provider_id: ${env.ENABLE_FAISS:=faiss} provider_type: inline::faiss config: kvstore: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db - - provider_id: ${env.ENABLE_SQLITE_VEC:+sqlite-vec} + - provider_id: ${env.ENABLE_SQLITE_VEC:=__disabled__} provider_type: inline::sqlite-vec config: db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db - - provider_id: ${env.ENABLE_CHROMADB:+chromadb} + - provider_id: ${env.ENABLE_CHROMADB:=__disabled__} provider_type: remote::chromadb config: url: ${env.CHROMADB_URL:+} - - provider_id: ${env.ENABLE_PGVECTOR:+pgvector} + - provider_id: ${env.ENABLE_PGVECTOR:=__disabled__} provider_type: remote::pgvector config: host: ${env.PGVECTOR_HOST:=localhost} @@ -115,6 +153,13 @@ providers: service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" sinks: ${env.TELEMETRY_SINKS:=console,sqlite} sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/trace_store.db + post_training: + - provider_id: huggingface + provider_type: inline::huggingface + config: + checkpoint_format: huggingface + distributed_backend: null + device: cpu eval: - provider_id: meta-reference provider_type: inline::meta-reference @@ -174,645 +219,649 @@ inference_store: db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/inference_store.db models: - metadata: {} - model_id: openai/gpt-4o - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/openai/gpt-4o + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: openai/gpt-4o model_type: llm - metadata: {} - model_id: openai/gpt-4o-mini - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/openai/gpt-4o-mini + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: openai/gpt-4o-mini model_type: llm - metadata: {} - model_id: openai/chatgpt-4o-latest - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/openai/chatgpt-4o-latest + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: openai/chatgpt-4o-latest model_type: llm - metadata: {} - model_id: openai/gpt-3.5-turbo-0125 - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-3.5-turbo-0125 + provider_id: 
${env.ENABLE_OPENAI:=__disabled__} provider_model_id: gpt-3.5-turbo-0125 model_type: llm - metadata: {} - model_id: openai/gpt-3.5-turbo - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-3.5-turbo + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: gpt-3.5-turbo model_type: llm - metadata: {} - model_id: openai/gpt-3.5-turbo-instruct - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-3.5-turbo-instruct + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: gpt-3.5-turbo-instruct model_type: llm - metadata: {} - model_id: openai/gpt-4 - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-4 + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: gpt-4 model_type: llm - metadata: {} - model_id: openai/gpt-4-turbo - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-4-turbo + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: gpt-4-turbo model_type: llm - metadata: {} - model_id: openai/gpt-4o - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-4o + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: gpt-4o model_type: llm - metadata: {} - model_id: openai/gpt-4o-2024-08-06 - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-4o-2024-08-06 + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: gpt-4o-2024-08-06 model_type: llm - metadata: {} - model_id: openai/gpt-4o-mini - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-4o-mini + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: gpt-4o-mini model_type: llm - metadata: {} - model_id: openai/gpt-4o-audio-preview - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/gpt-4o-audio-preview + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: gpt-4o-audio-preview model_type: llm - metadata: {} - model_id: openai/chatgpt-4o-latest - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/chatgpt-4o-latest + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: chatgpt-4o-latest model_type: llm - metadata: {} - model_id: openai/o1 - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/o1 + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: o1 model_type: llm - metadata: {} - model_id: openai/o1-mini - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/o1-mini + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: o1-mini model_type: llm - metadata: {} - model_id: openai/o3-mini - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/o3-mini + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: o3-mini model_type: llm - metadata: {} - model_id: openai/o4-mini - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/o4-mini + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: o4-mini model_type: llm - metadata: embedding_dimension: 1536 context_length: 8192 - model_id: openai/text-embedding-3-small - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/openai/text-embedding-3-small + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: openai/text-embedding-3-small model_type: embedding - metadata: embedding_dimension: 3072 context_length: 8192 - model_id: openai/text-embedding-3-large - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/openai/text-embedding-3-large + provider_id: 
${env.ENABLE_OPENAI:=__disabled__} provider_model_id: openai/text-embedding-3-large model_type: embedding - metadata: embedding_dimension: 1536 context_length: 8192 - model_id: openai/text-embedding-3-small - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/text-embedding-3-small + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: text-embedding-3-small model_type: embedding - metadata: embedding_dimension: 3072 context_length: 8192 - model_id: openai/text-embedding-3-large - provider_id: openai + model_id: ${env.ENABLE_OPENAI:=__disabled__}/text-embedding-3-large + provider_id: ${env.ENABLE_OPENAI:=__disabled__} provider_model_id: text-embedding-3-large model_type: embedding - metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-8b-instruct - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-v3p1-8b-instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct model_type: llm - metadata: {} - model_id: fireworks/meta-llama/Llama-3.1-8B-Instruct - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-3.1-8B-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-v3p1-8b-instruct model_type: llm - metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-70b-instruct - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-v3p1-70b-instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct model_type: llm - metadata: {} - model_id: fireworks/meta-llama/Llama-3.1-70B-Instruct - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-3.1-70B-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-v3p1-70b-instruct model_type: llm - metadata: {} - model_id: accounts/fireworks/models/llama-v3p1-405b-instruct - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-v3p1-405b-instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct model_type: llm - metadata: {} - model_id: fireworks/meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-v3p1-405b-instruct model_type: llm - metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-3b-instruct - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-v3p2-3b-instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct model_type: llm - metadata: {} - model_id: fireworks/meta-llama/Llama-3.2-3B-Instruct - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-3.2-3B-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-v3p2-3b-instruct model_type: llm - metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct - provider_id: fireworks + model_id: 
${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-v3p2-11b-vision-instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct model_type: llm - metadata: {} - model_id: fireworks/meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-v3p2-11b-vision-instruct model_type: llm - metadata: {} - model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-v3p2-90b-vision-instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct model_type: llm - metadata: {} - model_id: fireworks/meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-3.2-90B-Vision-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-v3p2-90b-vision-instruct model_type: llm - metadata: {} - model_id: accounts/fireworks/models/llama-v3p3-70b-instruct - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-v3p3-70b-instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct model_type: llm - metadata: {} - model_id: fireworks/meta-llama/Llama-3.3-70B-Instruct - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-3.3-70B-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-v3p3-70b-instruct model_type: llm - metadata: {} - model_id: accounts/fireworks/models/llama-guard-3-8b - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-guard-3-8b + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-guard-3-8b model_type: llm - metadata: {} - model_id: fireworks/meta-llama/Llama-Guard-3-8B - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-Guard-3-8B + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-guard-3-8b model_type: llm - metadata: {} - model_id: accounts/fireworks/models/llama-guard-3-11b-vision - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama-guard-3-11b-vision + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision model_type: llm - metadata: {} - model_id: fireworks/meta-llama/Llama-Guard-3-11B-Vision - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-Guard-3-11B-Vision + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama-guard-3-11b-vision model_type: llm - metadata: {} - model_id: accounts/fireworks/models/llama4-scout-instruct-basic - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama4-scout-instruct-basic + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: 
accounts/fireworks/models/llama4-scout-instruct-basic model_type: llm - metadata: {} - model_id: fireworks/meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama4-scout-instruct-basic model_type: llm - metadata: {} - model_id: accounts/fireworks/models/llama4-maverick-instruct-basic - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/accounts/fireworks/models/llama4-maverick-instruct-basic + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic model_type: llm - metadata: {} - model_id: fireworks/meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: accounts/fireworks/models/llama4-maverick-instruct-basic model_type: llm - metadata: embedding_dimension: 768 context_length: 8192 - model_id: fireworks/nomic-ai/nomic-embed-text-v1.5 - provider_id: fireworks + model_id: ${env.ENABLE_FIREWORKS:=__disabled__}/nomic-ai/nomic-embed-text-v1.5 + provider_id: ${env.ENABLE_FIREWORKS:=__disabled__} provider_model_id: nomic-ai/nomic-embed-text-v1.5 model_type: embedding - metadata: {} - model_id: together/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-3.1-8B-Instruct - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.1-8B-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-3.1-70B-Instruct - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.1-70B-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-3.2-3B-Instruct-Turbo - provider_id: together + model_id: 
${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.2-3B-Instruct-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-3.2-3B-Instruct - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.2-3B-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.2-90B-Vision-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-3.3-70B-Instruct-Turbo - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.3-70B-Instruct-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-3.3-70B-Instruct - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-3.3-70B-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Meta-Llama-Guard-3-8B - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Meta-Llama-Guard-3-8B + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Meta-Llama-Guard-3-8B model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-Guard-3-8B - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-Guard-3-8B + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Meta-Llama-Guard-3-8B model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-Guard-3-11B-Vision-Turbo - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-Guard-3-11B-Vision-Turbo + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-Guard-3-11B-Vision - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-Guard-3-11B-Vision + provider_id: 
${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo model_type: llm - metadata: embedding_dimension: 768 context_length: 8192 - model_id: togethercomputer/m2-bert-80M-8k-retrieval - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/togethercomputer/m2-bert-80M-8k-retrieval + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: togethercomputer/m2-bert-80M-8k-retrieval model_type: embedding - metadata: embedding_dimension: 768 context_length: 32768 - model_id: togethercomputer/m2-bert-80M-32k-retrieval - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/togethercomputer/m2-bert-80M-32k-retrieval + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: togethercomputer/m2-bert-80M-32k-retrieval model_type: embedding - metadata: {} - model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/together/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 model_type: llm - metadata: {} - model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - provider_id: together + model_id: ${env.ENABLE_TOGETHER:=__disabled__}/together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 + provider_id: ${env.ENABLE_TOGETHER:=__disabled__} provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 model_type: llm - metadata: {} - model_id: ollama/${env.OLLAMA_INFERENCE_MODEL:=__disabled__} - provider_id: ollama + model_id: ${env.ENABLE_OLLAMA:=__disabled__}/${env.OLLAMA_INFERENCE_MODEL:=__disabled__} + provider_id: ${env.ENABLE_OLLAMA:=__disabled__} provider_model_id: ${env.OLLAMA_INFERENCE_MODEL:=__disabled__} model_type: llm - metadata: embedding_dimension: ${env.OLLAMA_EMBEDDING_DIMENSION:=384} - model_id: ollama/${env.OLLAMA_EMBEDDING_MODEL:=__disabled__} - provider_id: ollama + model_id: ${env.ENABLE_OLLAMA:=__disabled__}/${env.OLLAMA_EMBEDDING_MODEL:=__disabled__} + provider_id: ${env.ENABLE_OLLAMA:=__disabled__} provider_model_id: ${env.OLLAMA_EMBEDDING_MODEL:=__disabled__} model_type: 
embedding - metadata: {} - model_id: anthropic/claude-3-5-sonnet-latest - provider_id: anthropic + model_id: ${env.ENABLE_OLLAMA:=__disabled__}/${env.OLLAMA_SAFETY_MODEL:=__disabled__} + provider_id: ${env.ENABLE_OLLAMA:=__disabled__} + provider_model_id: ${env.OLLAMA_SAFETY_MODEL:=__disabled__} + model_type: llm +- metadata: {} + model_id: ${env.ENABLE_ANTHROPIC:=__disabled__}/anthropic/claude-3-5-sonnet-latest + provider_id: ${env.ENABLE_ANTHROPIC:=__disabled__} provider_model_id: anthropic/claude-3-5-sonnet-latest model_type: llm - metadata: {} - model_id: anthropic/claude-3-7-sonnet-latest - provider_id: anthropic + model_id: ${env.ENABLE_ANTHROPIC:=__disabled__}/anthropic/claude-3-7-sonnet-latest + provider_id: ${env.ENABLE_ANTHROPIC:=__disabled__} provider_model_id: anthropic/claude-3-7-sonnet-latest model_type: llm - metadata: {} - model_id: anthropic/claude-3-5-haiku-latest - provider_id: anthropic + model_id: ${env.ENABLE_ANTHROPIC:=__disabled__}/anthropic/claude-3-5-haiku-latest + provider_id: ${env.ENABLE_ANTHROPIC:=__disabled__} provider_model_id: anthropic/claude-3-5-haiku-latest model_type: llm - metadata: embedding_dimension: 1024 context_length: 32000 - model_id: anthropic/voyage-3 - provider_id: anthropic + model_id: ${env.ENABLE_ANTHROPIC:=__disabled__}/anthropic/voyage-3 + provider_id: ${env.ENABLE_ANTHROPIC:=__disabled__} provider_model_id: anthropic/voyage-3 model_type: embedding - metadata: embedding_dimension: 512 context_length: 32000 - model_id: anthropic/voyage-3-lite - provider_id: anthropic + model_id: ${env.ENABLE_ANTHROPIC:=__disabled__}/anthropic/voyage-3-lite + provider_id: ${env.ENABLE_ANTHROPIC:=__disabled__} provider_model_id: anthropic/voyage-3-lite model_type: embedding - metadata: embedding_dimension: 1024 context_length: 32000 - model_id: anthropic/voyage-code-3 - provider_id: anthropic + model_id: ${env.ENABLE_ANTHROPIC:=__disabled__}/anthropic/voyage-code-3 + provider_id: ${env.ENABLE_ANTHROPIC:=__disabled__} provider_model_id: anthropic/voyage-code-3 model_type: embedding - metadata: {} - model_id: gemini/gemini-1.5-flash - provider_id: gemini + model_id: ${env.ENABLE_GEMINI:=__disabled__}/gemini/gemini-1.5-flash + provider_id: ${env.ENABLE_GEMINI:=__disabled__} provider_model_id: gemini/gemini-1.5-flash model_type: llm - metadata: {} - model_id: gemini/gemini-1.5-pro - provider_id: gemini + model_id: ${env.ENABLE_GEMINI:=__disabled__}/gemini/gemini-1.5-pro + provider_id: ${env.ENABLE_GEMINI:=__disabled__} provider_model_id: gemini/gemini-1.5-pro model_type: llm - metadata: {} - model_id: gemini/gemini-2.0-flash - provider_id: gemini + model_id: ${env.ENABLE_GEMINI:=__disabled__}/gemini/gemini-2.0-flash + provider_id: ${env.ENABLE_GEMINI:=__disabled__} provider_model_id: gemini/gemini-2.0-flash model_type: llm - metadata: {} - model_id: gemini/gemini-2.5-flash - provider_id: gemini + model_id: ${env.ENABLE_GEMINI:=__disabled__}/gemini/gemini-2.5-flash + provider_id: ${env.ENABLE_GEMINI:=__disabled__} provider_model_id: gemini/gemini-2.5-flash model_type: llm - metadata: {} - model_id: gemini/gemini-2.5-pro - provider_id: gemini + model_id: ${env.ENABLE_GEMINI:=__disabled__}/gemini/gemini-2.5-pro + provider_id: ${env.ENABLE_GEMINI:=__disabled__} provider_model_id: gemini/gemini-2.5-pro model_type: llm - metadata: embedding_dimension: 768 context_length: 2048 - model_id: gemini/text-embedding-004 - provider_id: gemini + model_id: ${env.ENABLE_GEMINI:=__disabled__}/gemini/text-embedding-004 + provider_id: ${env.ENABLE_GEMINI:=__disabled__} 
provider_model_id: gemini/text-embedding-004 model_type: embedding - metadata: {} - model_id: groq/llama3-8b-8192 - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/llama3-8b-8192 + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/llama3-8b-8192 model_type: llm - metadata: {} - model_id: groq/meta-llama/Llama-3.1-8B-Instruct - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-3.1-8B-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/llama3-8b-8192 model_type: llm - metadata: {} - model_id: groq/llama-3.1-8b-instant - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/llama-3.1-8b-instant + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/llama-3.1-8b-instant model_type: llm - metadata: {} - model_id: groq/llama3-70b-8192 - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/llama3-70b-8192 + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/llama3-70b-8192 model_type: llm - metadata: {} - model_id: groq/meta-llama/Llama-3-70B-Instruct - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-3-70B-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/llama3-70b-8192 model_type: llm - metadata: {} - model_id: groq/llama-3.3-70b-versatile - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/llama-3.3-70b-versatile + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/llama-3.3-70b-versatile model_type: llm - metadata: {} - model_id: groq/meta-llama/Llama-3.3-70B-Instruct - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-3.3-70B-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/llama-3.3-70b-versatile model_type: llm - metadata: {} - model_id: groq/llama-3.2-3b-preview - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/llama-3.2-3b-preview + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/llama-3.2-3b-preview model_type: llm - metadata: {} - model_id: groq/meta-llama/Llama-3.2-3B-Instruct - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-3.2-3B-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/llama-3.2-3b-preview model_type: llm - metadata: {} - model_id: groq/llama-4-scout-17b-16e-instruct - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/llama-4-scout-17b-16e-instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/llama-4-scout-17b-16e-instruct model_type: llm - metadata: {} - model_id: groq/meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/llama-4-scout-17b-16e-instruct model_type: llm - metadata: {} - model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/meta-llama/llama-4-scout-17b-16e-instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/meta-llama/llama-4-scout-17b-16e-instruct model_type: llm - metadata: {} - model_id: groq/meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: 
groq/meta-llama/llama-4-scout-17b-16e-instruct model_type: llm - metadata: {} - model_id: groq/llama-4-maverick-17b-128e-instruct - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/llama-4-maverick-17b-128e-instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/llama-4-maverick-17b-128e-instruct model_type: llm - metadata: {} - model_id: groq/meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/llama-4-maverick-17b-128e-instruct model_type: llm - metadata: {} - model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/groq/meta-llama/llama-4-maverick-17b-128e-instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct model_type: llm - metadata: {} - model_id: groq/meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: groq + model_id: ${env.ENABLE_GROQ:=__disabled__}/meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: ${env.ENABLE_GROQ:=__disabled__} provider_model_id: groq/meta-llama/llama-4-maverick-17b-128e-instruct model_type: llm - metadata: {} - model_id: sambanova/Meta-Llama-3.1-8B-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Meta-Llama-3.1-8B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct model_type: llm - metadata: {} - model_id: sambanova/meta-llama/Llama-3.1-8B-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-3.1-8B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Meta-Llama-3.1-8B-Instruct model_type: llm - metadata: {} - model_id: sambanova/Meta-Llama-3.1-405B-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Meta-Llama-3.1-405B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct model_type: llm - metadata: {} - model_id: sambanova/meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-3.1-405B-Instruct-FP8 + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Meta-Llama-3.1-405B-Instruct model_type: llm - metadata: {} - model_id: sambanova/Meta-Llama-3.2-1B-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Meta-Llama-3.2-1B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct model_type: llm - metadata: {} - model_id: sambanova/meta-llama/Llama-3.2-1B-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-3.2-1B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Meta-Llama-3.2-1B-Instruct model_type: llm - metadata: {} - model_id: sambanova/Meta-Llama-3.2-3B-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Meta-Llama-3.2-3B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct model_type: llm - metadata: {} - model_id: sambanova/meta-llama/Llama-3.2-3B-Instruct - provider_id: sambanova + 
model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-3.2-3B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Meta-Llama-3.2-3B-Instruct model_type: llm - metadata: {} - model_id: sambanova/Meta-Llama-3.3-70B-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Meta-Llama-3.3-70B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct model_type: llm - metadata: {} - model_id: sambanova/meta-llama/Llama-3.3-70B-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-3.3-70B-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Meta-Llama-3.3-70B-Instruct model_type: llm - metadata: {} - model_id: sambanova/Llama-3.2-11B-Vision-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Llama-3.2-11B-Vision-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct model_type: llm - metadata: {} - model_id: sambanova/meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-3.2-11B-Vision-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Llama-3.2-11B-Vision-Instruct model_type: llm - metadata: {} - model_id: sambanova/Llama-3.2-90B-Vision-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Llama-3.2-90B-Vision-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct model_type: llm - metadata: {} - model_id: sambanova/meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-3.2-90B-Vision-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Llama-3.2-90B-Vision-Instruct model_type: llm - metadata: {} - model_id: sambanova/Llama-4-Scout-17B-16E-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct model_type: llm - metadata: {} - model_id: sambanova/meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-4-Scout-17B-16E-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Llama-4-Scout-17B-16E-Instruct model_type: llm - metadata: {} - model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Llama-4-Maverick-17B-128E-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct model_type: llm - metadata: {} - model_id: sambanova/meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-4-Maverick-17B-128E-Instruct + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Llama-4-Maverick-17B-128E-Instruct model_type: llm - metadata: {} - model_id: sambanova/Meta-Llama-Guard-3-8B - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/sambanova/Meta-Llama-Guard-3-8B + 
provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Meta-Llama-Guard-3-8B model_type: llm - metadata: {} - model_id: sambanova/meta-llama/Llama-Guard-3-8B - provider_id: sambanova + model_id: ${env.ENABLE_SAMBANOVA:=__disabled__}/meta-llama/Llama-Guard-3-8B + provider_id: ${env.ENABLE_SAMBANOVA:=__disabled__} provider_model_id: sambanova/Meta-Llama-Guard-3-8B model_type: llm - metadata: {} - model_id: vllm/${env.VLLM_INFERENCE_MODEL:=__disabled__} - provider_id: vllm + model_id: ${env.ENABLE_VLLM:=__disabled__}/${env.VLLM_INFERENCE_MODEL:=__disabled__} + provider_id: ${env.ENABLE_VLLM:=__disabled__} provider_model_id: ${env.VLLM_INFERENCE_MODEL:=__disabled__} model_type: llm - metadata: embedding_dimension: 384 model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers + provider_id: ${env.ENABLE_SENTENCE_TRANSFORMERS:=sentence-transformers} model_type: embedding -shields: -- shield_id: meta-llama/Llama-Guard-3-8B +shields: [] vector_dbs: [] datasets: [] scoring_fns: [] diff --git a/llama_stack/templates/starter/starter.py b/llama_stack/templates/starter/starter.py index c0f2646d7..bbeef6b72 100644 --- a/llama_stack/templates/starter/starter.py +++ b/llama_stack/templates/starter/starter.py @@ -9,13 +9,13 @@ from llama_stack.apis.models import ModelType from llama_stack.distribution.datatypes import ( ModelInput, Provider, - ShieldInput, ToolGroupInput, ) from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) +from llama_stack.providers.inline.post_training.huggingface import HuggingFacePostTrainingConfig from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig from llama_stack.providers.inline.vector_io.sqlite_vec.config import ( SQLiteVectorIOConfig, @@ -24,6 +24,7 @@ from llama_stack.providers.remote.inference.anthropic.config import AnthropicCon from llama_stack.providers.remote.inference.anthropic.models import ( MODEL_ENTRIES as ANTHROPIC_MODEL_ENTRIES, ) +from llama_stack.providers.remote.inference.cerebras.config import CerebrasImplConfig from llama_stack.providers.remote.inference.fireworks.config import FireworksImplConfig from llama_stack.providers.remote.inference.fireworks.models import ( MODEL_ENTRIES as FIREWORKS_MODEL_ENTRIES, @@ -36,15 +37,24 @@ from llama_stack.providers.remote.inference.groq.config import GroqConfig from llama_stack.providers.remote.inference.groq.models import ( MODEL_ENTRIES as GROQ_MODEL_ENTRIES, ) +from llama_stack.providers.remote.inference.llama_openai_compat.config import ( + LlamaCompatConfig, +) +from llama_stack.providers.remote.inference.nvidia.config import NVIDIAConfig from llama_stack.providers.remote.inference.ollama.config import OllamaImplConfig from llama_stack.providers.remote.inference.openai.config import OpenAIConfig from llama_stack.providers.remote.inference.openai.models import ( MODEL_ENTRIES as OPENAI_MODEL_ENTRIES, ) +from llama_stack.providers.remote.inference.passthrough.config import ( + PassthroughImplConfig, +) from llama_stack.providers.remote.inference.sambanova.config import SambaNovaImplConfig from llama_stack.providers.remote.inference.sambanova.models import ( MODEL_ENTRIES as SAMBANOVA_MODEL_ENTRIES, ) +from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig +from llama_stack.providers.remote.inference.tgi.config import InferenceAPIImplConfig, TGIImplConfig from 
llama_stack.providers.remote.inference.together.config import TogetherImplConfig from llama_stack.providers.remote.inference.together.models import ( MODEL_ENTRIES as TOGETHER_MODEL_ENTRIES, @@ -54,6 +64,7 @@ from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOC from llama_stack.providers.remote.vector_io.pgvector.config import ( PGVectorVectorIOConfig, ) +from llama_stack.providers.utils.bedrock.config import BedrockBaseConfig from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry from llama_stack.providers.utils.sqlstore.sqlstore import PostgresSqlStoreConfig from llama_stack.templates.template import ( @@ -67,21 +78,25 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo # in this template, we allow each API key to be optional providers = [ ( + "${env.ENABLE_OPENAI:=__disabled__}", "openai", OPENAI_MODEL_ENTRIES, OpenAIConfig.sample_run_config(api_key="${env.OPENAI_API_KEY:+}"), ), ( + "${env.ENABLE_FIREWORKS:=__disabled__}", "fireworks", FIREWORKS_MODEL_ENTRIES, FireworksImplConfig.sample_run_config(api_key="${env.FIREWORKS_API_KEY:+}"), ), ( + "${env.ENABLE_TOGETHER:=__disabled__}", "together", TOGETHER_MODEL_ENTRIES, TogetherImplConfig.sample_run_config(api_key="${env.TOGETHER_API_KEY:+}"), ), ( + "${env.ENABLE_OLLAMA:=__disabled__}", "ollama", [ ProviderModelEntry( @@ -95,32 +110,41 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo "embedding_dimension": "${env.OLLAMA_EMBEDDING_DIMENSION:=384}", }, ), + ProviderModelEntry( + provider_model_id="${env.OLLAMA_SAFETY_MODEL:=__disabled__}", + model_type=ModelType.llm, + ), ], OllamaImplConfig.sample_run_config( url="${env.OLLAMA_URL:=http://localhost:11434}", raise_on_connect_error=False ), ), ( + "${env.ENABLE_ANTHROPIC:=__disabled__}", "anthropic", ANTHROPIC_MODEL_ENTRIES, AnthropicConfig.sample_run_config(api_key="${env.ANTHROPIC_API_KEY:+}"), ), ( + "${env.ENABLE_GEMINI:=__disabled__}", "gemini", GEMINI_MODEL_ENTRIES, GeminiConfig.sample_run_config(api_key="${env.GEMINI_API_KEY:+}"), ), ( + "${env.ENABLE_GROQ:=__disabled__}", "groq", GROQ_MODEL_ENTRIES, GroqConfig.sample_run_config(api_key="${env.GROQ_API_KEY:+}"), ), ( + "${env.ENABLE_SAMBANOVA:=__disabled__}", "sambanova", SAMBANOVA_MODEL_ENTRIES, SambaNovaImplConfig.sample_run_config(api_key="${env.SAMBANOVA_API_KEY:+}"), ), ( + "${env.ENABLE_VLLM:=__disabled__}", "vllm", [ ProviderModelEntry( @@ -132,14 +156,88 @@ def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderMo url="${env.VLLM_URL:=http://localhost:8000/v1}", ), ), + ( + "${env.ENABLE_TGI:=__disabled__}", + "tgi", + [], + TGIImplConfig.sample_run_config( + url="${env.TGI_URL:+}", + endpoint_name="${env.INFERENCE_ENDPOINT_NAME:+}", + ), + ), + # TODO: re-add once the Python 3.13 issue is fixed + # discussion: https://github.com/meta-llama/llama-stack/pull/2327#discussion_r2156883828 + # ( + # "watsonx", + # [], + # WatsonXConfig.sample_run_config(api_key="${env.WATSONX_API_KEY:}"), + # ), + ( + "${env.ENABLE_CEREBRAS:=__disabled__}", + "cerebras", + [], + CerebrasImplConfig.sample_run_config(api_key="${env.CEREBRAS_API_KEY:+}"), + ), + ( + "${env.ENABLE_LLAMA_OPENAI_COMPAT:=__disabled__}", + "llama-openai-compat", + [], + LlamaCompatConfig.sample_run_config(api_key="${env.LLAMA_API_KEY:+:}"), + ), + ( + "${env.ENABLE_NVIDIA:=__disabled__}", + "nvidia", + [], + NVIDIAConfig.sample_run_config( + api_key="${env.NVIDIA_API_KEY:+}", + url="${env.NVIDIA_BASE_URL:__disabled__}", + ), + 
), + ( + "${env.ENABLE_HF_SERVERLESS:=__disabled__}", + "hf::serverless", + [], + InferenceAPIImplConfig.sample_run_config( + api_token="${env.HF_API_TOKEN:+:}", + repo="${env.INFERENCE_MODEL:+:}", + ), + ), + ( + "${env.ENABLE_HF_ENDPOINT:=__disabled__}", + "hf::endpoint", + [], + InferenceEndpointImplConfig.sample_run_config( + api_token="${env.HF_API_TOKEN:+:}", + endpoint_name="${env.INFERENCE_ENDPOINT_NAME:+:}", + ), + ), + ( + "${env.ENABLE_BEDROCK:=__disabled__}", + "bedrock", + [], + BedrockBaseConfig.sample_run_config( + aws_access_key_id="${env.AWS_ACCESS_KEY_ID:+}", + aws_secret_access_key="${env.AWS_SECRET_ACCESS_KEY:+}", + aws_session_token="${env.AWS_SESSION_TOKEN:+}", + region_name="${env.AWS_DEFAULT_REGION:+}", + ), + ), + ( + "${env.ENABLE_PASSTHROUGH:=__disabled__}", + "passthrough", + [], + PassthroughImplConfig.sample_run_config( + url="${env.PASSTHROUGH_URL:+:}", api_key="${env.PASSTHROUGH_API_KEY:+:}" + ), + ), ] inference_providers = [] available_models = {} - for provider_id, model_entries, config in providers: + for provider_id, provider_type, model_entries, config in providers: inference_providers.append( Provider( provider_id=provider_id, - provider_type=f"remote::{provider_id}", + provider_type=f"remote::{provider_type}", config=config, ) ) @@ -156,6 +254,7 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "post_training": ["inline::huggingface"], "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], @@ -170,22 +269,22 @@ def get_distribution_template() -> DistributionTemplate: vector_io_providers = [ Provider( - provider_id="faiss", + provider_id="${env.ENABLE_FAISS:=faiss}", provider_type="inline::faiss", config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), ), Provider( - provider_id="${env.ENABLE_SQLITE_VEC:+sqlite-vec}", + provider_id="${env.ENABLE_SQLITE_VEC:=__disabled__}", provider_type="inline::sqlite-vec", config=SQLiteVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), ), Provider( - provider_id="${env.ENABLE_CHROMADB:+chromadb}", + provider_id="${env.ENABLE_CHROMADB:=__disabled__}", provider_type="remote::chromadb", config=ChromaVectorIOConfig.sample_run_config(url="${env.CHROMADB_URL:+}"), ), Provider( - provider_id="${env.ENABLE_PGVECTOR:+pgvector}", + provider_id="${env.ENABLE_PGVECTOR:=__disabled__}", provider_type="remote::pgvector", config=PGVectorVectorIOConfig.sample_run_config( db="${env.PGVECTOR_DB:+}", @@ -200,11 +299,15 @@ def get_distribution_template() -> DistributionTemplate: config=LocalfsFilesImplConfig.sample_run_config(f"~/.llama/distributions/{name}"), ) embedding_provider = Provider( - provider_id="sentence-transformers", + provider_id="${env.ENABLE_SENTENCE_TRANSFORMERS:=sentence-transformers}", provider_type="inline::sentence-transformers", config=SentenceTransformersInferenceConfig.sample_run_config(), ) - + post_training_provider = Provider( + provider_id="huggingface", + provider_type="inline::huggingface", + config=HuggingFacePostTrainingConfig.sample_run_config(f"~/.llama/distributions/{name}"), + ) default_tool_groups = [ ToolGroupInput( toolgroup_id="builtin::websearch", @@ -242,10 +345,14 @@ def get_distribution_template() -> DistributionTemplate: "inference": inference_providers + [embedding_provider], "vector_io": vector_io_providers, 
"files": [files_provider], + "post_training": [post_training_provider], }, default_models=default_models + [embedding_model], default_tool_groups=default_tool_groups, - default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], + # TODO: add a way to enable/disable shields on the fly + # default_shields=[ + # ShieldInput(provider_id="llama-guard", shield_id="${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-8B}") + # ], ), }, run_config_env_vars={ diff --git a/llama_stack/templates/tgi/__init__.py b/llama_stack/templates/tgi/__init__.py deleted file mode 100644 index fa1932f6a..000000000 --- a/llama_stack/templates/tgi/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .tgi import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/tgi/build.yaml b/llama_stack/templates/tgi/build.yaml deleted file mode 100644 index 3ac3968e8..000000000 --- a/llama_stack/templates/tgi/build.yaml +++ /dev/null @@ -1,35 +0,0 @@ -version: 2 -distribution_spec: - description: Use (an external) TGI server for running LLM inference - providers: - inference: - - remote::tgi - - inline::sentence-transformers - vector_io: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime - - remote::model-context-protocol -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/tgi/doc_template.md b/llama_stack/templates/tgi/doc_template.md deleted file mode 100644 index 68b475893..000000000 --- a/llama_stack/templates/tgi/doc_template.md +++ /dev/null @@ -1,137 +0,0 @@ ---- -orphan: true ---- - -# TGI Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. - -{{ providers_table }} - -You can use this distribution if you have GPUs and want to run an independent TGI server container for running inference. - -{% if run_config_env_vars %} -### Environment Variables - -The following environment variables can be configured: - -{% for var, (default_value, description) in run_config_env_vars.items() %} -- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) -{% endfor %} -{% endif %} - - -## Setting up TGI server - -Please check the [TGI Getting Started Guide](https://github.com/huggingface/text-generation-inference?tab=readme-ov-file#get-started) to get a TGI endpoint. 
Here is a sample script to start a TGI server locally via Docker: - -```bash -export INFERENCE_PORT=8080 -export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct -export CUDA_VISIBLE_DEVICES=0 - -docker run --rm -it \ - --pull always \ - -v $HOME/.cache/huggingface:/data \ - -p $INFERENCE_PORT:$INFERENCE_PORT \ - --gpus $CUDA_VISIBLE_DEVICES \ - ghcr.io/huggingface/text-generation-inference:2.3.1 \ - --dtype bfloat16 \ - --usage-stats off \ - --sharded false \ - --cuda-memory-fraction 0.7 \ - --model-id $INFERENCE_MODEL \ - --port $INFERENCE_PORT -``` - -If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like: - -```bash -export SAFETY_PORT=8081 -export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B -export CUDA_VISIBLE_DEVICES=1 - -docker run --rm -it \ - --pull always \ - -v $HOME/.cache/huggingface:/data \ - -p $SAFETY_PORT:$SAFETY_PORT \ - --gpus $CUDA_VISIBLE_DEVICES \ - ghcr.io/huggingface/text-generation-inference:2.3.1 \ - --dtype bfloat16 \ - --usage-stats off \ - --sharded false \ - --model-id $SAFETY_MODEL \ - --port $SAFETY_PORT -``` - -## Running Llama Stack - -Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. - -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - llamastack/distribution-{{ name }} \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -# You need a local checkout of llama-stack to run this, get it using -# git clone https://github.com/meta-llama/llama-stack.git -cd /path/to/llama-stack - -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ~/.llama:/root/.llama \ - -v ./llama_stack/templates/tgi/run-with-safety.yaml:/root/my-run.yaml \ - llamastack/distribution-{{ name }} \ - --config /root/my-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env TGI_URL=http://host.docker.internal:$INFERENCE_PORT \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env TGI_SAFETY_URL=http://host.docker.internal:$SAFETY_PORT -``` - -### Via Conda - -Make sure you have done `uv pip install llama-stack` and have the Llama Stack CLI available. 
- -```bash -llama stack build --template {{ name }} --image-type conda -llama stack run ./run.yaml - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT -``` - -If you are using Llama Stack Safety / Shield APIs, use: - -```bash -llama stack run ./run-with-safety.yaml \ - --port $LLAMA_STACK_PORT \ - --env INFERENCE_MODEL=$INFERENCE_MODEL \ - --env TGI_URL=http://127.0.0.1:$INFERENCE_PORT \ - --env SAFETY_MODEL=$SAFETY_MODEL \ - --env TGI_SAFETY_URL=http://127.0.0.1:$SAFETY_PORT -``` diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml deleted file mode 100644 index 63da62a03..000000000 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ /dev/null @@ -1,132 +0,0 @@ -version: 2 -image_name: tgi -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: tgi-inference - provider_type: remote::tgi - config: - url: ${env.TGI_URL} - - provider_id: tgi-safety - provider_type: remote::tgi - config: - url: ${env.TGI_SAFETY_URL} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} -metadata_store: - type: sqlite - db_path: 
${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/inference_store.db -models: -- metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: tgi-inference - model_type: llm -- metadata: {} - model_id: ${env.SAFETY_MODEL} - provider_id: tgi-safety - model_type: llm -shields: -- shield_id: ${env.SAFETY_MODEL} -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml deleted file mode 100644 index 430494121..000000000 --- a/llama_stack/templates/tgi/run.yaml +++ /dev/null @@ -1,131 +0,0 @@ -version: 2 -image_name: tgi -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: tgi-inference - provider_type: remote::tgi - config: - url: ${env.TGI_URL} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} -metadata_store: - type: 
sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/tgi}/inference_store.db -models: -- metadata: {} - model_id: ${env.INFERENCE_MODEL} - provider_id: tgi-inference - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: [] -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -server: - port: 8321 diff --git a/llama_stack/templates/tgi/tgi.py b/llama_stack/templates/tgi/tgi.py deleted file mode 100644 index 394cde18e..000000000 --- a/llama_stack/templates/tgi/tgi.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from pathlib import Path - -from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ShieldInput, - ToolGroupInput, -) -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig -from llama_stack.providers.remote.inference.tgi import TGIImplConfig -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::tgi", "inline::sentence-transformers"], - "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::rag-runtime", - "remote::model-context-protocol", - ], - } - name = "tgi" - inference_provider = Provider( - provider_id="tgi-inference", - provider_type="remote::tgi", - config=TGIImplConfig.sample_run_config( - url="${env.TGI_URL}", - ), - ) - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - vector_io_provider = Provider( - provider_id="faiss", - provider_type="inline::faiss", - config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - - inference_model = ModelInput( - model_id="${env.INFERENCE_MODEL}", - provider_id="tgi-inference", - ) - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id="sentence-transformers", - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - safety_model = ModelInput( - model_id="${env.SAFETY_MODEL}", - provider_id="tgi-safety", - ) - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ] - - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Use (an external) TGI server for running 
LLM inference", - container_image=None, - template_path=Path(__file__).parent / "doc_template.md", - providers=providers, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider, embedding_provider], - "vector_io": [vector_io_provider], - }, - default_models=[inference_model, embedding_model], - default_tool_groups=default_tool_groups, - ), - "run-with-safety.yaml": RunConfigSettings( - provider_overrides={ - "inference": [ - inference_provider, - Provider( - provider_id="tgi-safety", - provider_type="remote::tgi", - config=TGIImplConfig.sample_run_config( - url="${env.TGI_SAFETY_URL}", - ), - ), - ], - "vector_io": [vector_io_provider], - }, - default_models=[ - inference_model, - safety_model, - ], - default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")], - default_tool_groups=default_tool_groups, - ), - }, - run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - "INFERENCE_MODEL": ( - "meta-llama/Llama-3.2-3B-Instruct", - "Inference model loaded into the TGI server", - ), - "TGI_URL": ( - "http://127.0.0.1:8080/v1", - "URL of the TGI server with the main inference model", - ), - "TGI_SAFETY_URL": ( - "http://127.0.0.1:8081/v1", - "URL of the TGI server with the safety model", - ), - "SAFETY_MODEL": ( - "meta-llama/Llama-Guard-3-1B", - "Name of the safety (Llama-Guard) model to use", - ), - }, - ) diff --git a/llama_stack/templates/together/__init__.py b/llama_stack/templates/together/__init__.py deleted file mode 100644 index 757995b6b..000000000 --- a/llama_stack/templates/together/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .together import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/together/build.yaml b/llama_stack/templates/together/build.yaml deleted file mode 100644 index 518a843da..000000000 --- a/llama_stack/templates/together/build.yaml +++ /dev/null @@ -1,36 +0,0 @@ -version: 2 -distribution_spec: - description: Use Together.AI for running LLM inference - providers: - inference: - - remote::together - - inline::sentence-transformers - vector_io: - - inline::faiss - - remote::chromadb - - remote::pgvector - safety: - - inline::llama-guard - agents: - - inline::meta-reference - telemetry: - - inline::meta-reference - eval: - - inline::meta-reference - datasetio: - - remote::huggingface - - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust - tool_runtime: - - remote::brave-search - - remote::tavily-search - - inline::rag-runtime - - remote::model-context-protocol - - remote::wolfram-alpha -image_type: conda -additional_pip_packages: -- aiosqlite -- sqlalchemy[asyncio] diff --git a/llama_stack/templates/together/doc_template.md b/llama_stack/templates/together/doc_template.md deleted file mode 100644 index 5a01595c4..000000000 --- a/llama_stack/templates/together/doc_template.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -orphan: true ---- -# Together Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. 
- -{{ providers_table }} - -{% if run_config_env_vars %} -### Environment Variables - -The following environment variables can be configured: - -{% for var, (default_value, description) in run_config_env_vars.items() %} -- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) -{% endfor %} -{% endif %} - -{% if default_models %} -### Models - -The following models are available by default: - -{% for model in default_models %} -- `{{ model.model_id }} {{ model.doc_string }}` -{% endfor %} -{% endif %} - - -### Prerequisite: API Keys - -Make sure you have access to a Together API Key. You can get one by visiting [together.xyz](https://together.xyz/). - - -## Running Llama Stack with Together - -You can do this via Conda (build code) or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. - -```bash -LLAMA_STACK_PORT=8321 -docker run \ - -it \ - --pull always \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - llamastack/distribution-{{ name }} \ - --port $LLAMA_STACK_PORT \ - --env TOGETHER_API_KEY=$TOGETHER_API_KEY -``` - -### Via Conda - -```bash -llama stack build --template {{ name }} --image-type conda -llama stack run ./run.yaml \ - --port $LLAMA_STACK_PORT \ - --env TOGETHER_API_KEY=$TOGETHER_API_KEY -``` diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml deleted file mode 100644 index 7ae2a1d1a..000000000 --- a/llama_stack/templates/together/run-with-safety.yaml +++ /dev/null @@ -1,279 +0,0 @@ -version: 2 -image_name: together -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: together - provider_type: remote::together - config: - url: https://api.together.xyz/v1 - api_key: ${env.TOGETHER_API_KEY:+} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: {} - - provider_id: llama-guard-vision - provider_type: inline::llama-guard - config: {} - - provider_id: code-scanner - provider_type: inline::code-scanner - config: {} - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/huggingface_datasetio.db - 
- provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} - - provider_id: wolfram-alpha - provider_type: remote::wolfram-alpha - config: - api_key: ${env.WOLFRAM_ALPHA_API_KEY:+} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/inference_store.db -models: -- metadata: {} - model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-70B-Instruct - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo - provider_id: together - provider_model_id: 
meta-llama/Llama-3.3-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Meta-Llama-Guard-3-8B - provider_id: together - provider_model_id: meta-llama/Meta-Llama-Guard-3-8B - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-8B - provider_id: together - provider_model_id: meta-llama/Meta-Llama-Guard-3-8B - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo - provider_id: together - provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-11B-Vision - provider_id: together - provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo - model_type: llm -- metadata: - embedding_dimension: 768 - context_length: 8192 - model_id: togethercomputer/m2-bert-80M-8k-retrieval - provider_id: together - provider_model_id: togethercomputer/m2-bert-80M-8k-retrieval - model_type: embedding -- metadata: - embedding_dimension: 768 - context_length: 32768 - model_id: togethercomputer/m2-bert-80M-32k-retrieval - provider_id: together - provider_model_id: togethercomputer/m2-bert-80M-32k-retrieval - model_type: embedding -- metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - model_type: llm -- metadata: {} - model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - provider_id: together - provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - model_type: llm -- metadata: {} - model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - provider_id: together - provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: -- shield_id: meta-llama/Llama-Guard-3-8B - provider_id: llama-guard -- shield_id: meta-llama/Llama-Guard-3-11B-Vision - provider_id: llama-guard-vision -- shield_id: CodeScanner - provider_id: code-scanner -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -- toolgroup_id: builtin::wolfram_alpha - provider_id: wolfram-alpha -server: - port: 8321 diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml deleted file mode 100644 index dc09aeac9..000000000 --- a/llama_stack/templates/together/run.yaml +++ /dev/null @@ -1,269 +0,0 @@ -version: 2 -image_name: together -apis: -- agents -- datasetio -- eval -- inference -- safety -- scoring -- telemetry -- tool_runtime -- vector_io -providers: - inference: - - provider_id: together - provider_type: remote::together - config: - 
url: https://api.together.xyz/v1 - api_key: ${env.TOGETHER_API_KEY:+} - - provider_id: sentence-transformers - provider_type: inline::sentence-transformers - config: {} - vector_io: - - provider_id: faiss - provider_type: inline::faiss - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/faiss_store.db - safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: - excluded_categories: [] - agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - persistence_store: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/agents_store.db - responses_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/responses_store.db - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" - sinks: ${env.TELEMETRY_SINKS:=console,sqlite} - sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/trace_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/meta_reference_eval.db - datasetio: - - provider_id: huggingface - provider_type: remote::huggingface - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/huggingface_datasetio.db - - provider_id: localfs - provider_type: inline::localfs - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:+} - tool_runtime: - - provider_id: brave-search - provider_type: remote::brave-search - config: - api_key: ${env.BRAVE_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: tavily-search - provider_type: remote::tavily-search - config: - api_key: ${env.TAVILY_SEARCH_API_KEY:+} - max_results: 3 - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} - - provider_id: wolfram-alpha - provider_type: remote::wolfram-alpha - config: - api_key: ${env.WOLFRAM_ALPHA_API_KEY:+} -metadata_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/registry.db -inference_store: - type: sqlite - db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/together}/inference_store.db -models: -- metadata: {} - model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-8B-Instruct - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-70B-Instruct - provider_id: together - provider_model_id: 
meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.1-405B-Instruct-FP8 - provider_id: together - provider_model_id: meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-3B-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-3.2-3B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-11B-Vision-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.2-90B-Vision-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo - provider_id: together - provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-3.3-70B-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-3.3-70B-Instruct-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Meta-Llama-Guard-3-8B - provider_id: together - provider_model_id: meta-llama/Meta-Llama-Guard-3-8B - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-8B - provider_id: together - provider_model_id: meta-llama/Meta-Llama-Guard-3-8B - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo - provider_id: together - provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-Guard-3-11B-Vision - provider_id: together - provider_model_id: meta-llama/Llama-Guard-3-11B-Vision-Turbo - model_type: llm -- metadata: - embedding_dimension: 768 - context_length: 8192 - model_id: togethercomputer/m2-bert-80M-8k-retrieval - provider_id: together - provider_model_id: togethercomputer/m2-bert-80M-8k-retrieval - model_type: embedding -- metadata: - embedding_dimension: 768 - context_length: 32768 - model_id: togethercomputer/m2-bert-80M-32k-retrieval - provider_id: together - provider_model_id: togethercomputer/m2-bert-80M-32k-retrieval - model_type: embedding -- metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - model_type: llm -- metadata: {} - model_id: together/meta-llama/Llama-4-Scout-17B-16E-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-4-Scout-17B-16E-Instruct - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - 
provider_id: together - provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - model_type: llm -- metadata: {} - model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct - provider_id: together - provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - model_type: llm -- metadata: {} - model_id: together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - provider_id: together - provider_model_id: meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 - model_type: llm -- metadata: - embedding_dimension: 384 - model_id: all-MiniLM-L6-v2 - provider_id: sentence-transformers - model_type: embedding -shields: -- shield_id: meta-llama/Llama-Guard-3-8B -vector_dbs: [] -datasets: [] -scoring_fns: [] -benchmarks: [] -tool_groups: -- toolgroup_id: builtin::websearch - provider_id: tavily-search -- toolgroup_id: builtin::rag - provider_id: rag-runtime -- toolgroup_id: builtin::wolfram_alpha - provider_id: wolfram-alpha -server: - port: 8321 diff --git a/llama_stack/templates/together/together.py b/llama_stack/templates/together/together.py deleted file mode 100644 index 4c64ff3cd..000000000 --- a/llama_stack/templates/together/together.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from pathlib import Path - -from llama_stack.apis.models import ModelType -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ShieldInput, - ToolGroupInput, -) -from llama_stack.providers.inline.inference.sentence_transformers import ( - SentenceTransformersInferenceConfig, -) -from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig -from llama_stack.providers.remote.inference.together import TogetherImplConfig -from llama_stack.providers.remote.inference.together.models import MODEL_ENTRIES -from llama_stack.templates.template import ( - DistributionTemplate, - RunConfigSettings, - get_model_registry, -) - - -def get_distribution_template() -> DistributionTemplate: - providers = { - "inference": ["remote::together", "inline::sentence-transformers"], - "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], - "agents": ["inline::meta-reference"], - "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], - "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], - "tool_runtime": [ - "remote::brave-search", - "remote::tavily-search", - "inline::rag-runtime", - "remote::model-context-protocol", - "remote::wolfram-alpha", - ], - } - name = "together" - inference_provider = Provider( - provider_id="together", - provider_type="remote::together", - config=TogetherImplConfig.sample_run_config(), - ) - vector_io_provider = Provider( - provider_id="faiss", - provider_type="inline::faiss", - config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"), - ) - embedding_provider = Provider( - provider_id="sentence-transformers", - provider_type="inline::sentence-transformers", - config=SentenceTransformersInferenceConfig.sample_run_config(), - ) - available_models = { - "together": MODEL_ENTRIES, - } - default_models = get_model_registry(available_models) - default_tool_groups = [ - ToolGroupInput( - toolgroup_id="builtin::websearch", - provider_id="tavily-search", - ), - ToolGroupInput( - 
toolgroup_id="builtin::rag", - provider_id="rag-runtime", - ), - ToolGroupInput( - toolgroup_id="builtin::wolfram_alpha", - provider_id="wolfram-alpha", - ), - ] - embedding_model = ModelInput( - model_id="all-MiniLM-L6-v2", - provider_id="sentence-transformers", - model_type=ModelType.embedding, - metadata={ - "embedding_dimension": 384, - }, - ) - - return DistributionTemplate( - name=name, - distro_type="self_hosted", - description="Use Together.AI for running LLM inference", - container_image=None, - template_path=Path(__file__).parent / "doc_template.md", - providers=providers, - available_models_by_provider=available_models, - run_configs={ - "run.yaml": RunConfigSettings( - provider_overrides={ - "inference": [inference_provider, embedding_provider], - "vector_io": [vector_io_provider], - }, - default_models=default_models + [embedding_model], - default_tool_groups=default_tool_groups, - default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], - ), - "run-with-safety.yaml": RunConfigSettings( - provider_overrides={ - "inference": [ - inference_provider, - embedding_provider, - ], - "vector_io": [vector_io_provider], - "safety": [ - Provider( - provider_id="llama-guard", - provider_type="inline::llama-guard", - config={}, - ), - Provider( - provider_id="llama-guard-vision", - provider_type="inline::llama-guard", - config={}, - ), - Provider( - provider_id="code-scanner", - provider_type="inline::code-scanner", - config={}, - ), - ], - }, - default_models=[ - *default_models, - embedding_model, - ], - default_shields=[ - ShieldInput( - shield_id="meta-llama/Llama-Guard-3-8B", - provider_id="llama-guard", - ), - ShieldInput( - shield_id="meta-llama/Llama-Guard-3-11B-Vision", - provider_id="llama-guard-vision", - ), - ShieldInput( - shield_id="CodeScanner", - provider_id="code-scanner", - ), - ], - default_tool_groups=default_tool_groups, - ), - }, - run_config_env_vars={ - "LLAMA_STACK_PORT": ( - "8321", - "Port for the Llama Stack distribution server", - ), - "TOGETHER_API_KEY": ( - "", - "Together.AI API Key", - ), - }, - ) diff --git a/llama_stack/templates/watsonx/__init__.py b/llama_stack/templates/watsonx/__init__.py index 078d86144..756f351d8 100644 --- a/llama_stack/templates/watsonx/__init__.py +++ b/llama_stack/templates/watsonx/__init__.py @@ -3,5 +3,3 @@ # # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. - -from .watsonx import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/watsonx/doc_template.md b/llama_stack/templates/watsonx/doc_template.md deleted file mode 100644 index f28dbf0bf..000000000 --- a/llama_stack/templates/watsonx/doc_template.md +++ /dev/null @@ -1,74 +0,0 @@ ---- -orphan: true ---- -# watsonx Distribution - -```{toctree} -:maxdepth: 2 -:hidden: - -self -``` - -The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations. 
- -{{ providers_table }} - -{% if run_config_env_vars %} - -### Environment Variables - -The following environment variables can be configured: - -{% for var, (default_value, description) in run_config_env_vars.items() %} -- `{{ var }}`: {{ description }} (default: `{{ default_value }}`) -{% endfor %} -{% endif %} - -{% if default_models %} -### Models - -The following models are available by default: - -{% for model in default_models %} -- `{{ model.model_id }} {{ model.doc_string }}` -{% endfor %} -{% endif %} - - -### Prerequisite: API Keys - -Make sure you have access to a watsonx API Key. You can get one by referring [watsonx.ai](https://www.ibm.com/docs/en/masv-and-l/maximo-manage/continuous-delivery?topic=setup-create-watsonx-api-key). - - -## Running Llama Stack with watsonx - -You can do this via Conda (build code), venv or Docker which has a pre-built image. - -### Via Docker - -This method allows you to get started quickly without having to build the distribution code. - -```bash -LLAMA_STACK_PORT=5001 -docker run \ - -it \ - -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ - -v ./run.yaml:/root/my-run.yaml \ - llamastack/distribution-{{ name }} \ - --config /root/my-run.yaml \ - --port $LLAMA_STACK_PORT \ - --env WATSONX_API_KEY=$WATSONX_API_KEY \ - --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID \ - --env WATSONX_BASE_URL=$WATSONX_BASE_URL -``` - -### Via Conda - -```bash -llama stack build --template watsonx --image-type conda -llama stack run ./run.yaml \ - --port $LLAMA_STACK_PORT \ - --env WATSONX_API_KEY=$WATSONX_API_KEY \ - --env WATSONX_PROJECT_ID=$WATSONX_PROJECT_ID -``` diff --git a/tests/integration/README.md b/tests/integration/README.md index 31d58c83f..3d3aa3d77 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -11,7 +11,7 @@ pytest --help Here are the most important options: - `--stack-config`: specify the stack config to use. You have three ways to point to a stack: - a URL which points to a Llama Stack distribution server - - a template (e.g., `fireworks`, `together`) or a path to a `run.yaml` file + - a template (e.g., `starter`) or a path to a `run.yaml` file - a comma-separated list of api=provider pairs, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference`. This is most useful for testing a single API surface. - `--env`: set environment variables, e.g. --env KEY=value. this is a utility option to set environment variables required by various providers. 
@@ -32,28 +32,29 @@ Experimental, under development, options: ## Examples -Run all text inference tests with the `together` distribution: +Run all text inference tests with the `starter` distribution using the `together` provider: ```bash -pytest -s -v tests/integration/inference/test_text_inference.py \ - --stack-config=together \ +ENABLE_TOGETHER=together pytest -s -v tests/integration/inference/test_text_inference.py \ + --stack-config=starter \ --text-model=meta-llama/Llama-3.1-8B-Instruct ``` -Run all text inference tests with the `together` distribution and `meta-llama/Llama-3.1-8B-Instruct`: +Run all text inference tests with the `starter` distribution using the `together` provider and `meta-llama/Llama-3.1-8B-Instruct`: ```bash -pytest -s -v tests/integration/inference/test_text_inference.py \ - --stack-config=together \ +ENABLE_TOGETHER=together pytest -s -v tests/integration/inference/test_text_inference.py \ + --stack-config=starter \ --text-model=meta-llama/Llama-3.1-8B-Instruct ``` -Running all inference tests for a number of models: +Running all inference tests for a number of models using the `together` provider: ```bash TEXT_MODELS=meta-llama/Llama-3.1-8B-Instruct,meta-llama/Llama-3.1-70B-Instruct VISION_MODELS=meta-llama/Llama-3.2-11B-Vision-Instruct EMBEDDING_MODELS=all-MiniLM-L6-v2 +ENABLE_TOGETHER=together export TOGETHER_API_KEY= pytest -s -v tests/integration/inference/ \ diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index fa96688c0..daf80059c 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -65,7 +65,7 @@ def pytest_addoption(parser): help=textwrap.dedent( """ a 'pointer' to the stack. this can be either be: - (a) a template name like `fireworks`, or + (a) a template name like `starter`, or (b) a path to a run.yaml file, or (c) an adhoc config spec, e.g. `inference=fireworks,safety=llama-guard,agents=meta-reference` """ diff --git a/tests/integration/fixtures/common.py b/tests/integration/fixtures/common.py index 8b6b3ddbe..9a734e8a5 100644 --- a/tests/integration/fixtures/common.py +++ b/tests/integration/fixtures/common.py @@ -7,6 +7,7 @@ import inspect import os import tempfile +from urllib.parse import urlparse import pytest import yaml @@ -122,12 +123,17 @@ def llama_stack_client(request, provider_data): if not config: raise ValueError("You must specify either --stack-config or LLAMA_STACK_CONFIG") - # check if this looks like a URL - if config.startswith("http") or "//" in config: - return LlamaStackClient( - base_url=config, - provider_data=provider_data, - ) + # check if this looks like a URL using proper URL parsing + try: + parsed_url = urlparse(config) + if parsed_url.scheme and parsed_url.netloc: + return LlamaStackClient( + base_url=config, + provider_data=provider_data, + ) + except Exception: + # If URL parsing fails, treat as non-URL config + pass if "=" in config: run_config = run_config_from_adhoc_config_spec(config) diff --git a/tests/integration/inference/test_openai_completion.py b/tests/integration/inference/test_openai_completion.py index 3e43af272..05aee5096 100644 --- a/tests/integration/inference/test_openai_completion.py +++ b/tests/integration/inference/test_openai_completion.py @@ -45,7 +45,7 @@ def skip_if_model_doesnt_support_suffix(client_with_models, model_id): # To test `fim` ( fill in the middle ) completion, we need to use a model that supports suffix. # Use this to specifically test this API functionality. 
- # pytest -sv --stack-config="inference=ollama" \ + # pytest -sv --stack-config="inference=starter" \ # tests/integration/inference/test_openai_completion.py \ # --text-model qwen2.5-coder:1.5b \ # -k test_openai_completion_non_streaming_suffix
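---

A minimal, self-contained sketch of the URL detection added to `tests/integration/fixtures/common.py` in the hunk above: a `--stack-config` value is routed to a remote `LlamaStackClient` only when `urlparse` yields both a scheme and a network location, so template names like `starter` and adhoc `api=provider` specs fall through to the library-client path. This is an illustration of the check, not part of the diff; the helper name and the sample strings are hypothetical.

```python
from urllib.parse import urlparse


def looks_like_server_url(config: str) -> bool:
    """Mirror of the check in llama_stack_client(): treat the config as a
    server URL only if it parses with both a scheme and a netloc."""
    try:
        parsed = urlparse(config)
        return bool(parsed.scheme and parsed.netloc)
    except Exception:
        # If parsing fails, fall back to treating it as a non-URL config.
        return False


# Illustrative inputs only: a server URL, a template name, and an adhoc spec.
for cfg in ("http://localhost:8321", "starter", "inference=fireworks,safety=llama-guard"):
    print(f"{cfg!r} -> server URL: {looks_like_server_url(cfg)}")
```

Only the first string has both a scheme (`http`) and a netloc (`localhost:8321`), so only it is sent to a remote client; the template name and the `api=provider` spec are handled by the subsequent branches in the fixture.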