From 7301403ce38ae3c3309199602f7cd3472a9238b8 Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Thu, 5 Dec 2024 16:29:32 -0800
Subject: [PATCH] Add eval/scoring/datasetio API providers to distribution
 templates & UI developer guide (#564)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# What does this PR do?

- add /eval, /scoring, /datasetio API providers to distribution templates
- regenerate build.yaml / run.yaml files
- fix `template.py` to take in a list of providers instead of only the first one
- override the memory provider with faiss as the default for all distros (only
  one memory provider is needed to start the basic flow; chromadb/pgvector
  require an additional setup step)

```
python llama_stack/scripts/distro_codegen.py
```

- update the README to start the UI from conda builds

## Test Plan

```
python llama_stack/scripts/distro_codegen.py
```

- Use the newly generated `run.yaml` to start the server

```
llama stack run ./llama_stack/templates/together/run.yaml
```

#### Registration
```
❯ llama-stack-client datasets register \
--dataset-id "mmlu" \
--provider-id "huggingface" \
--url "https://huggingface.co/datasets/llamastack/evals" \
--metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \
--schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string", "chat_completion_input": {"type": "string"}}}'

❯ llama-stack-client datasets list
┏━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ metadata                                ┃ type    ┃
┡━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━┩
│ mmlu       │ huggingface │ {'path': 'llamastack/evals', 'name':    │ dataset │
│            │             │ 'evals__mmlu__details', 'split':        │         │
│            │             │ 'train'}                                │         │
└────────────┴─────────────┴─────────────────────────────────────────┴─────────┘
```

```
❯ llama-stack-client datasets register \
--dataset-id "simpleqa" \
--provider-id "huggingface" \
--url "https://huggingface.co/datasets/llamastack/evals" \
--metadata '{"path": "llamastack/evals", "name": "evals__simpleqa", "split": "train"}' \
--schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string", "chat_completion_input": {"type": "string"}}}'

❯ llama-stack-client datasets list
┏━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┓
┃ identifier ┃ provider_id ┃ metadata                                                      ┃ type    ┃
┡━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━┩
│ mmlu       │ huggingface │ {'path': 'llamastack/evals', 'name': 'evals__mmlu__details', │ dataset │
│            │             │ 'split': 'train'}                                             │         │
│ simpleqa   │ huggingface │ {'path': 'llamastack/evals', 'name': 'evals__simpleqa',      │ dataset │
│            │             │ 'split': 'train'}                                             │         │
└────────────┴─────────────┴───────────────────────────────────────────────────────────────┴─────────┘
```

```
❯ llama-stack-client eval_tasks register \
> --eval-task-id meta-reference-mmlu \
> --provider-id meta-reference \
> --dataset-id mmlu \
> --scoring-functions basic::regex_parser_multiple_choice_answer

❯ llama-stack-client eval_tasks register \
--eval-task-id meta-reference-simpleqa \
--provider-id meta-reference \
--dataset-id simpleqa \
--scoring-functions llm-as-judge::405b-simpleqa

❯ llama-stack-client eval_tasks list
┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ dataset_id ┃ identifier       ┃ metadata ┃ provider_id    ┃ provider_resour… ┃ scoring_functio… ┃ type      ┃
┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩ │ mmlu │ meta-reference-… │ {} │ meta-reference │ meta-reference-… │ ['basic::regex_… │ eval_task │ │ simpleqa │ meta-reference-… │ {} │ meta-reference │ meta-reference-… │ ['llm-as-judge:… │ eval_task │ └────────────┴──────────────────┴──────────┴────────────────┴──────────────────┴──────────────────┴───────────┘ ``` #### Test with UI ``` streamlit run app.py ``` ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. --- distributions/dependencies.json | 290 ++++++++++-------- .../self_hosted_distro/bedrock.md | 6 +- .../self_hosted_distro/fireworks.md | 3 + .../self_hosted_distro/meta-reference-gpu.md | 3 + .../meta-reference-quantized-gpu.md | 3 + .../self_hosted_distro/ollama.md | 5 +- .../distributions/self_hosted_distro/tgi.md | 3 + .../self_hosted_distro/together.md | 3 + llama_stack/distribution/ui/README.md | 41 ++- llama_stack/templates/bedrock/bedrock.py | 20 +- llama_stack/templates/bedrock/build.yaml | 9 + llama_stack/templates/bedrock/run.yaml | 24 ++ llama_stack/templates/fireworks/build.yaml | 9 + llama_stack/templates/fireworks/fireworks.py | 14 +- llama_stack/templates/fireworks/run.yaml | 24 ++ llama_stack/templates/hf-endpoint/build.yaml | 9 + .../templates/hf-endpoint/hf_endpoint.py | 17 +- .../hf-endpoint/run-with-safety.yaml | 24 ++ llama_stack/templates/hf-endpoint/run.yaml | 24 ++ .../templates/hf-serverless/build.yaml | 9 + .../templates/hf-serverless/hf_serverless.py | 16 +- .../hf-serverless/run-with-safety.yaml | 24 ++ llama_stack/templates/hf-serverless/run.yaml | 24 ++ .../templates/meta-reference-gpu/build.yaml | 9 + .../meta-reference-gpu/meta_reference.py | 15 +- .../meta-reference-gpu/run-with-safety.yaml | 24 ++ .../templates/meta-reference-gpu/run.yaml | 24 ++ .../meta-reference-quantized-gpu/build.yaml | 9 + .../meta_reference.py | 14 +- .../meta-reference-quantized-gpu/run.yaml | 24 ++ llama_stack/templates/ollama/build.yaml | 9 + llama_stack/templates/ollama/doc_template.md | 6 +- llama_stack/templates/ollama/ollama.py | 17 +- .../templates/ollama/run-with-safety.yaml | 24 ++ llama_stack/templates/ollama/run.yaml | 24 ++ llama_stack/templates/remote-vllm/vllm.py | 12 +- llama_stack/templates/template.py | 55 ++-- llama_stack/templates/tgi/build.yaml | 9 + .../templates/tgi/run-with-safety.yaml | 24 ++ llama_stack/templates/tgi/run.yaml | 24 ++ llama_stack/templates/tgi/tgi.py | 15 +- llama_stack/templates/together/build.yaml | 9 + llama_stack/templates/together/run.yaml | 24 ++ llama_stack/templates/together/together.py | 14 +- llama_stack/templates/vllm-gpu/build.yaml | 9 + llama_stack/templates/vllm-gpu/run.yaml | 24 ++ llama_stack/templates/vllm-gpu/vllm.py | 14 +- 47 files changed, 841 insertions(+), 195 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 80468cc73..4e66a85da 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,10 +1,12 @@ { - "tgi": [ + "hf-serverless": [ "aiohttp", "aiosqlite", + "autoevals", "blobfile", "chardet", "chromadb-client", + "datasets", "faiss-cpu", "fastapi", "fire", @@ -13,6 +15,7 @@ 
"matplotlib", "nltk", "numpy", + "openai", "pandas", "pillow", "psycopg2-binary", @@ -27,6 +30,66 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], + "together": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "openai", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "together", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "vllm-gpu": [ + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "openai", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "vllm", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], "remote-vllm": [ "aiosqlite", "blobfile", @@ -54,18 +117,22 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "vllm-gpu": [ + "fireworks": [ "aiosqlite", + "autoevals", "blobfile", "chardet", "chromadb-client", + "datasets", "faiss-cpu", "fastapi", "fire", + "fireworks-ai", "httpx", "matplotlib", "nltk", "numpy", + "openai", "pandas", "pillow", "psycopg2-binary", @@ -77,82 +144,17 @@ "tqdm", "transformers", "uvicorn", - "vllm", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "meta-reference-quantized-gpu": [ - "accelerate", - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "fairscale", - "faiss-cpu", - "fastapi", - "fbgemm-gpu", - "fire", - "httpx", - "lm-format-enforcer", - "matplotlib", - "nltk", - "numpy", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "torch", - "torchao==0.5.0", - "torchvision", - "tqdm", - "transformers", - "uvicorn", - "zmq", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "meta-reference-gpu": [ - "accelerate", - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "fairscale", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "lm-format-enforcer", - "matplotlib", - "nltk", - "numpy", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "torch", - "torchvision", - "tqdm", - "transformers", - "uvicorn", - "zmq", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "hf-serverless": [ + "tgi": [ "aiohttp", "aiosqlite", + "autoevals", "blobfile", "chardet", "chromadb-client", + "datasets", "faiss-cpu", "fastapi", "fire", @@ -161,61 +163,7 @@ "matplotlib", "nltk", "numpy", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - "sentencepiece", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "together": [ - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "pandas", - "pillow", - "psycopg2-binary", - "pypdf", - "redis", - "scikit-learn", - "scipy", - 
"sentencepiece", - "together", - "tqdm", - "transformers", - "uvicorn", - "sentence-transformers --no-deps", - "torch --index-url https://download.pytorch.org/whl/cpu" - ], - "ollama": [ - "aiohttp", - "aiosqlite", - "blobfile", - "chardet", - "chromadb-client", - "faiss-cpu", - "fastapi", - "fire", - "httpx", - "matplotlib", - "nltk", - "numpy", - "ollama", + "openai", "pandas", "pillow", "psycopg2-binary", @@ -232,10 +180,12 @@ ], "bedrock": [ "aiosqlite", + "autoevals", "blobfile", "boto3", "chardet", "chromadb-client", + "datasets", "faiss-cpu", "fastapi", "fire", @@ -243,6 +193,7 @@ "matplotlib", "nltk", "numpy", + "openai", "pandas", "pillow", "psycopg2-binary", @@ -257,20 +208,24 @@ "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "hf-endpoint": [ - "aiohttp", + "meta-reference-gpu": [ + "accelerate", "aiosqlite", + "autoevals", "blobfile", "chardet", "chromadb-client", + "datasets", + "fairscale", "faiss-cpu", "fastapi", "fire", "httpx", - "huggingface_hub", + "lm-format-enforcer", "matplotlib", "nltk", "numpy", + "openai", "pandas", "pillow", "psycopg2-binary", @@ -279,25 +234,34 @@ "scikit-learn", "scipy", "sentencepiece", + "torch", + "torchvision", "tqdm", "transformers", "uvicorn", + "zmq", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], - "fireworks": [ + "meta-reference-quantized-gpu": [ + "accelerate", "aiosqlite", + "autoevals", "blobfile", "chardet", "chromadb-client", + "datasets", + "fairscale", "faiss-cpu", "fastapi", + "fbgemm-gpu", "fire", - "fireworks-ai", "httpx", + "lm-format-enforcer", "matplotlib", "nltk", "numpy", + "openai", "pandas", "pillow", "psycopg2-binary", @@ -306,9 +270,13 @@ "scikit-learn", "scipy", "sentencepiece", + "torch", + "torchao==0.5.0", + "torchvision", "tqdm", "transformers", "uvicorn", + "zmq", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" ], @@ -337,5 +305,67 @@ "uvicorn", "sentence-transformers --no-deps", "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "ollama": [ + "aiohttp", + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "matplotlib", + "nltk", + "numpy", + "ollama", + "openai", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" + ], + "hf-endpoint": [ + "aiohttp", + "aiosqlite", + "autoevals", + "blobfile", + "chardet", + "chromadb-client", + "datasets", + "faiss-cpu", + "fastapi", + "fire", + "httpx", + "huggingface_hub", + "matplotlib", + "nltk", + "numpy", + "openai", + "pandas", + "pillow", + "psycopg2-binary", + "pypdf", + "redis", + "scikit-learn", + "scipy", + "sentencepiece", + "tqdm", + "transformers", + "uvicorn", + "sentence-transformers --no-deps", + "torch --index-url https://download.pytorch.org/whl/cpu" ] } diff --git a/docs/source/distributions/self_hosted_distro/bedrock.md b/docs/source/distributions/self_hosted_distro/bedrock.md index e0a5d80d0..ae03c89da 100644 --- a/docs/source/distributions/self_hosted_distro/bedrock.md +++ b/docs/source/distributions/self_hosted_distro/bedrock.md @@ -1,6 +1,3 @@ ---- -orphan: true ---- # Bedrock Distribution ```{toctree} @@ -15,9 +12,12 @@ The `llamastack/distribution-bedrock` distribution consists of the following pro | API | 
Provider(s) | |-----|-------------| | agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, `inline::localfs` | +| eval | `inline::meta-reference` | | inference | `remote::bedrock` | | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `remote::bedrock` | +| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index e54302c2e..06a12cb1d 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -15,9 +15,12 @@ The `llamastack/distribution-fireworks` distribution consists of the following p | API | Provider(s) | |-----|-------------| | agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, `inline::localfs` | +| eval | `inline::meta-reference` | | inference | `remote::fireworks` | | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | +| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index f9717894f..73d6befd4 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -15,9 +15,12 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo | API | Provider(s) | |-----|-------------| | agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, `inline::localfs` | +| eval | `inline::meta-reference` | | inference | `inline::meta-reference` | | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | +| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md index 3ca161d07..fab9c6cd8 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md @@ -15,9 +15,12 @@ The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists | API | Provider(s) | |-----|-------------| | agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, `inline::localfs` | +| eval | `inline::meta-reference` | | inference | `inline::meta-reference-quantized` | | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | | safety | `inline::llama-guard` | +| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index 9f81d9329..c915a7ac3 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -15,9 +15,12 @@ The `llamastack/distribution-ollama` distribution consists of the following prov | API | Provider(s) | |-----|-------------| | agents | `inline::meta-reference` | +| datasetio | `remote::huggingface`, 
`inline::localfs` |
+| eval | `inline::meta-reference` |
 | inference | `remote::ollama` |
 | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
+| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
 
 
@@ -119,7 +122,7 @@ llama stack run ./run-with-safety.yaml \
 ### (Optional) Update Model Serving Configuration
 
 ```{note}
-Please check the [model_aliases](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) variable for supported Ollama models.
+Please check the [model_aliases](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) for the supported Ollama models.
 ```
 
 To serve a new model with `ollama`
diff --git a/docs/source/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md
index 59485226e..84b91da38 100644
--- a/docs/source/distributions/self_hosted_distro/tgi.md
+++ b/docs/source/distributions/self_hosted_distro/tgi.md
@@ -16,9 +16,12 @@ The `llamastack/distribution-tgi` distribution consists of the following provide
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
+| datasetio | `remote::huggingface`, `inline::localfs` |
+| eval | `inline::meta-reference` |
 | inference | `remote::tgi` |
 | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
+| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
 
 
diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md
index 5cfc9e805..c458fdb5f 100644
--- a/docs/source/distributions/self_hosted_distro/together.md
+++ b/docs/source/distributions/self_hosted_distro/together.md
@@ -15,9 +15,12 @@ The `llamastack/distribution-together` distribution consists of the following pr
 | API | Provider(s) |
 |-----|-------------|
 | agents | `inline::meta-reference` |
+| datasetio | `remote::huggingface`, `inline::localfs` |
+| eval | `inline::meta-reference` |
 | inference | `remote::together` |
 | memory | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 | safety | `inline::llama-guard` |
+| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` |
 | telemetry | `inline::meta-reference` |
 
 
diff --git a/llama_stack/distribution/ui/README.md b/llama_stack/distribution/ui/README.md
index 2cc352c52..c0a2597af 100644
--- a/llama_stack/distribution/ui/README.md
+++ b/llama_stack/distribution/ui/README.md
@@ -1,16 +1,41 @@
-# LLama Stack UI
+# (Experimental) Llama Stack UI
 
-[!NOTE] This is a work in progress.
+## Docker Setup
 
-## Prerequisite
-- Start up Llama Stack Server
-```
-llama stack run
-```
+:warning: This is a work in progress.
 
-## Running Streamlit App
+## Developer Setup
+
+1. Start up Llama Stack API server. More details [here](https://llama-stack.readthedocs.io/en/latest/getting_started/index.html).
 
 ```
+llama stack build --template together --image-type conda
+
+llama stack run together
+```
+
+2. (Optional) Register datasets and eval tasks as resources if you want to run pre-configured evaluation flows (e.g. the Evaluations (Generation + Scoring) page).
+ +```bash +$ llama-stack-client datasets register \ +--dataset-id "mmlu" \ +--provider-id "huggingface" \ +--url "https://huggingface.co/datasets/llamastack/evals" \ +--metadata '{"path": "llamastack/evals", "name": "evals__mmlu__details", "split": "train"}' \ +--schema '{"input_query": {"type": "string"}, "expected_answer": {"type": "string", "chat_completion_input": {"type": "string"}}}' +``` + +```bash +$ llama-stack-client eval_tasks register \ +--eval-task-id meta-reference-mmlu \ +--provider-id meta-reference \ +--dataset-id mmlu \ +--scoring-functions basic::regex_parser_multiple_choice_answer +``` + +3. Start Streamlit UI + +```bash cd llama_stack/distribution/ui pip install -r requirements.txt streamlit run app.py diff --git a/llama_stack/templates/bedrock/bedrock.py b/llama_stack/templates/bedrock/bedrock.py index cf3c342fe..c52b56612 100644 --- a/llama_stack/templates/bedrock/bedrock.py +++ b/llama_stack/templates/bedrock/bedrock.py @@ -6,6 +6,9 @@ from pathlib import Path +from llama_stack.distribution.datatypes import Provider + +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -16,10 +19,19 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["remote::bedrock"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } + name = "bedrock" + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) return DistributionTemplate( - name="bedrock", + name=name, distro_type="self_hosted", description="Use AWS Bedrock for running LLM inference and safety", docker_image=None, @@ -27,7 +39,11 @@ def get_distribution_template() -> DistributionTemplate: providers=providers, default_models=[], run_configs={ - "run.yaml": RunConfigSettings(), + "run.yaml": RunConfigSettings( + provider_overrides={ + "memory": [memory_provider], + }, + ), }, run_config_env_vars={ "LLAMASTACK_PORT": ( diff --git a/llama_stack/templates/bedrock/build.yaml b/llama_stack/templates/bedrock/build.yaml index c73db3eae..cd36c320e 100644 --- a/llama_stack/templates/bedrock/build.yaml +++ b/llama_stack/templates/bedrock/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index 1f632a1f2..77d4f2248 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: bedrock apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -37,6 +40,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: 
inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/fireworks/build.yaml b/llama_stack/templates/fireworks/build.yaml index c16e3f5d6..30ea347ae 100644 --- a/llama_stack/templates/fireworks/build.yaml +++ b/llama_stack/templates/fireworks/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/fireworks/fireworks.py b/llama_stack/templates/fireworks/fireworks.py index 5f744cae0..64387e4b7 100644 --- a/llama_stack/templates/fireworks/fireworks.py +++ b/llama_stack/templates/fireworks/fireworks.py @@ -9,6 +9,7 @@ from pathlib import Path from llama_models.sku_list import all_registered_models from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.fireworks import FireworksImplConfig from llama_stack.providers.remote.inference.fireworks.fireworks import MODEL_ALIASES @@ -22,13 +23,23 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } + name = "fireworks" + inference_provider = Provider( provider_id="fireworks", provider_type="remote::fireworks", config=FireworksImplConfig.sample_run_config(), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) core_model_to_hf_repo = { m.descriptor(): m.huggingface_repo for m in all_registered_models() @@ -42,7 +53,7 @@ def get_distribution_template() -> DistributionTemplate: ] return DistributionTemplate( - name="fireworks", + name=name, distro_type="self_hosted", description="Use Fireworks.AI for running LLM inference", docker_image=None, @@ -53,6 +64,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=default_models, default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")], diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index 6add39c3a..9296be28f 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: fireworks apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -39,6 +42,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: 
inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/hf-endpoint/build.yaml b/llama_stack/templates/hf-endpoint/build.yaml index 798cb3961..523cf5d83 100644 --- a/llama_stack/templates/hf-endpoint/build.yaml +++ b/llama_stack/templates/hf-endpoint/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/hf-endpoint/hf_endpoint.py b/llama_stack/templates/hf-endpoint/hf_endpoint.py index af00114ba..297fdae51 100644 --- a/llama_stack/templates/hf-endpoint/hf_endpoint.py +++ b/llama_stack/templates/hf-endpoint/hf_endpoint.py @@ -5,6 +5,7 @@ # the root directory of this source tree. from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.tgi import InferenceEndpointImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -16,13 +17,21 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } - + name = "hf-endpoint" inference_provider = Provider( provider_id="hf-endpoint", provider_type="remote::hf::endpoint", config=InferenceEndpointImplConfig.sample_run_config(), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", @@ -34,7 +43,7 @@ def get_distribution_template() -> DistributionTemplate: ) return DistributionTemplate( - name="hf-endpoint", + name=name, distro_type="self_hosted", description="Use (an external) Hugging Face Inference Endpoint for running LLM inference", docker_image=None, @@ -45,6 +54,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=[inference_model], ), @@ -59,7 +69,8 @@ def get_distribution_template() -> DistributionTemplate: endpoint_name="${env.SAFETY_INFERENCE_ENDPOINT_NAME}", ), ), - ] + ], + "memory": [memory_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index d518f29b8..bd625ffc5 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: hf-endpoint apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -44,6 +47,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + 
datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index ff4e90606..bf0697bba 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: hf-endpoint apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -39,6 +42,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/hf-serverless/build.yaml b/llama_stack/templates/hf-serverless/build.yaml index 3c03a98c1..af7eb60fe 100644 --- a/llama_stack/templates/hf-serverless/build.yaml +++ b/llama_stack/templates/hf-serverless/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/hf-serverless/hf_serverless.py b/llama_stack/templates/hf-serverless/hf_serverless.py index 5434de986..835495bb9 100644 --- a/llama_stack/templates/hf-serverless/hf_serverless.py +++ b/llama_stack/templates/hf-serverless/hf_serverless.py @@ -5,6 +5,7 @@ # the root directory of this source tree. 
from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.tgi import InferenceAPIImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -16,13 +17,22 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } + name = "hf-serverless" inference_provider = Provider( provider_id="hf-serverless", provider_type="remote::hf::serverless", config=InferenceAPIImplConfig.sample_run_config(), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", @@ -34,7 +44,7 @@ def get_distribution_template() -> DistributionTemplate: ) return DistributionTemplate( - name="hf-serverless", + name=name, distro_type="self_hosted", description="Use (an external) Hugging Face Inference Endpoint for running LLM inference", docker_image=None, @@ -45,6 +55,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=[inference_model], ), @@ -59,7 +70,8 @@ def get_distribution_template() -> DistributionTemplate: repo="${env.SAFETY_MODEL}", ), ), - ] + ], + "memory": [memory_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index e7591bbf0..f5ead14d4 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: hf-serverless apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -44,6 +47,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index d7ec02f6a..13e2d7789 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: hf-serverless apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -39,6 +42,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + 
provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/meta-reference-gpu/build.yaml b/llama_stack/templates/meta-reference-gpu/build.yaml index ef075d098..300b75b14 100644 --- a/llama_stack/templates/meta-reference-gpu/build.yaml +++ b/llama_stack/templates/meta-reference-gpu/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/meta-reference-gpu/meta_reference.py b/llama_stack/templates/meta-reference-gpu/meta_reference.py index f254bc920..0aff9f39c 100644 --- a/llama_stack/templates/meta-reference-gpu/meta_reference.py +++ b/llama_stack/templates/meta-reference-gpu/meta_reference.py @@ -10,6 +10,7 @@ from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput from llama_stack.providers.inline.inference.meta_reference import ( MetaReferenceInferenceConfig, ) +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -20,8 +21,11 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } - + name = "meta-reference-gpu" inference_provider = Provider( provider_id="meta-reference-inference", provider_type="inline::meta-reference", @@ -30,6 +34,11 @@ def get_distribution_template() -> DistributionTemplate: checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}", ), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", @@ -41,7 +50,7 @@ def get_distribution_template() -> DistributionTemplate: ) return DistributionTemplate( - name="meta-reference-gpu", + name=name, distro_type="self_hosted", description="Use Meta Reference for running LLM inference", template_path=Path(__file__).parent / "doc_template.md", @@ -51,6 +60,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=[inference_model], ), @@ -67,6 +77,7 @@ def get_distribution_template() -> DistributionTemplate: ), ), ], + "memory": [memory_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index f82e0c938..d0fa05e96 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: meta-reference-gpu apis: - agents +- 
datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -46,6 +49,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index b125169a3..3675f4a58 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: meta-reference-gpu apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -40,6 +43,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml index 961864dac..9d866de18 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py index 1ff5d31d6..1d611ae5f 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py +++ b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py @@ -10,6 +10,7 @@ from llama_stack.distribution.datatypes import ModelInput, Provider from llama_stack.providers.inline.inference.meta_reference import ( MetaReferenceQuantizedInferenceConfig, ) +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -20,8 +21,11 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } - + name = "meta-reference-quantized-gpu" inference_provider = Provider( 
provider_id="meta-reference-inference", provider_type="inline::meta-reference-quantized", @@ -30,13 +34,18 @@ def get_distribution_template() -> DistributionTemplate: checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:null}", ), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", provider_id="meta-reference-inference", ) return DistributionTemplate( - name="meta-reference-quantized-gpu", + name=name, distro_type="self_hosted", description="Use Meta Reference with fp8, int4 quantization for running LLM inference", template_path=Path(__file__).parent / "doc_template.md", @@ -46,6 +55,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=[inference_model], ), diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index e1104b623..081af0f59 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: meta-reference-quantized-gpu apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -42,6 +45,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml index 106449309..a021e4993 100644 --- a/llama_stack/templates/ollama/build.yaml +++ b/llama_stack/templates/ollama/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md index cfefce33d..a75583592 100644 --- a/llama_stack/templates/ollama/doc_template.md +++ b/llama_stack/templates/ollama/doc_template.md @@ -114,9 +114,9 @@ llama stack run ./run-with-safety.yaml \ ### (Optional) Update Model Serving Configuration -> [!NOTE] -> Please check the [OLLAMA_SUPPORTED_MODELS](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers.remote/inference/ollama/ollama.py) for the supported Ollama models. - +```{note} +Please check the [model_aliases](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/inference/ollama/ollama.py#L45) for the supported Ollama models. 
+``` To serve a new model with `ollama` ```bash diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py index b30c75bb5..c24dfa6e9 100644 --- a/llama_stack/templates/ollama/ollama.py +++ b/llama_stack/templates/ollama/ollama.py @@ -7,6 +7,7 @@ from pathlib import Path from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.ollama import OllamaImplConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -18,13 +19,21 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], + "eval": ["inline::meta-reference"], + "datasetio": ["remote::huggingface", "inline::localfs"], + "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], } - + name = "ollama" inference_provider = Provider( provider_id="ollama", provider_type="remote::ollama", config=OllamaImplConfig.sample_run_config(), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", @@ -36,7 +45,7 @@ def get_distribution_template() -> DistributionTemplate: ) return DistributionTemplate( - name="ollama", + name=name, distro_type="self_hosted", description="Use (an external) Ollama server for running LLM inference", docker_image=None, @@ -47,6 +56,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=[inference_model], ), @@ -54,7 +64,8 @@ def get_distribution_template() -> DistributionTemplate: provider_overrides={ "inference": [ inference_provider, - ] + ], + "memory": [memory_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index 6c86677b3..dc282f996 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: ollama apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -38,6 +41,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index b2d6f2c18..ab8e12839 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: ollama apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -38,6 +41,27 @@ providers: - 
provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py index c3858f7e5..f5ccfcf16 100644 --- a/llama_stack/templates/remote-vllm/vllm.py +++ b/llama_stack/templates/remote-vllm/vllm.py @@ -7,6 +7,7 @@ from pathlib import Path from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput +from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig from llama_stack.providers.remote.inference.vllm import VLLMInferenceAdapterConfig from llama_stack.templates.template import DistributionTemplate, RunConfigSettings @@ -19,7 +20,7 @@ def get_distribution_template() -> DistributionTemplate: "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], } - + name = "remote-vllm" inference_provider = Provider( provider_id="vllm-inference", provider_type="remote::vllm", @@ -27,6 +28,11 @@ def get_distribution_template() -> DistributionTemplate: url="${env.VLLM_URL}", ), ) + memory_provider = Provider( + provider_id="faiss", + provider_type="inline::faiss", + config=FaissImplConfig.sample_run_config(f"distributions/{name}"), + ) inference_model = ModelInput( model_id="${env.INFERENCE_MODEL}", @@ -38,7 +44,7 @@ def get_distribution_template() -> DistributionTemplate: ) return DistributionTemplate( - name="remote-vllm", + name=name, distro_type="self_hosted", description="Use (an external) vLLM server for running LLM inference", template_path=Path(__file__).parent / "doc_template.md", @@ -48,6 +54,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider], + "memory": [memory_provider], }, default_models=[inference_model], ), @@ -63,6 +70,7 @@ def get_distribution_template() -> DistributionTemplate: ), ), ], + "memory": [memory_provider], }, default_models=[ inference_model, diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index bf74b95d1..e82be6394 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ -44,36 +44,37 @@ class RunConfigSettings(BaseModel): provider_configs[api_str] = api_providers continue - provider_type = provider_types[0] - provider_id = provider_type.split("::")[-1] + provider_configs[api_str] = [] + for provider_type in provider_types: + provider_id = provider_type.split("::")[-1] - api = Api(api_str) - if provider_type not in provider_registry[api]: - raise ValueError( - f"Unknown provider type: {provider_type} for API: {api_str}" + api = Api(api_str) + if provider_type not in provider_registry[api]: + raise ValueError( + f"Unknown provider type: {provider_type} for API: {api_str}" + ) + + config_class = provider_registry[api][provider_type].config_class + assert ( + config_class is not None + ), f"No config class for provider type: {provider_type} for API: {api_str}" + + config_class = instantiate_class_type(config_class) + if 
hasattr(config_class, "sample_run_config"): + config = config_class.sample_run_config( + __distro_dir__=f"distributions/{name}" + ) + else: + config = {} + + provider_configs[api_str].append( + Provider( + provider_id=provider_id, + provider_type=provider_type, + config=config, + ) ) - config_class = provider_registry[api][provider_type].config_class - assert ( - config_class is not None - ), f"No config class for provider type: {provider_type} for API: {api_str}" - - config_class = instantiate_class_type(config_class) - if hasattr(config_class, "sample_run_config"): - config = config_class.sample_run_config( - __distro_dir__=f"distributions/{name}" - ) - else: - config = {} - - provider_configs[api_str] = [ - Provider( - provider_id=provider_id, - provider_type=provider_type, - config=config, - ) - ] - # Get unique set of APIs from providers apis = list(sorted(providers.keys())) diff --git a/llama_stack/templates/tgi/build.yaml b/llama_stack/templates/tgi/build.yaml index 0f7602e2f..d90b505df 100644 --- a/llama_stack/templates/tgi/build.yaml +++ b/llama_stack/templates/tgi/build.yaml @@ -16,4 +16,13 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference + eval: + - inline::meta-reference + datasetio: + - remote::huggingface + - inline::localfs + scoring: + - inline::basic + - inline::llm-as-judge + - inline::braintrust image_type: conda diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index ebf082cd6..2ee82ddc3 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: tgi apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -42,6 +45,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index 352afabb5..c45e114ee 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -4,9 +4,12 @@ docker_image: null conda_env: tgi apis: - agents +- datasetio +- eval - inference - memory - safety +- scoring - telemetry providers: inference: @@ -38,6 +41,27 @@ providers: - provider_id: meta-reference provider_type: inline::meta-reference config: {} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: braintrust + provider_type: inline::braintrust + config: {} metadata_store: namespace: null type: sqlite diff --git a/llama_stack/templates/tgi/tgi.py b/llama_stack/templates/tgi/tgi.py index 
index caa341df3..83818a598 100644
--- a/llama_stack/templates/tgi/tgi.py
+++ b/llama_stack/templates/tgi/tgi.py
@@ -7,6 +7,7 @@
 from pathlib import Path
 
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.tgi import TGIImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
@@ -18,8 +19,11 @@ def get_distribution_template() -> DistributionTemplate:
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
         "telemetry": ["inline::meta-reference"],
+        "eval": ["inline::meta-reference"],
+        "datasetio": ["remote::huggingface", "inline::localfs"],
+        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
     }
-
+    name = "tgi"
     inference_provider = Provider(
         provider_id="tgi-inference",
         provider_type="remote::tgi",
@@ -27,6 +31,11 @@ def get_distribution_template() -> DistributionTemplate:
             url="${env.TGI_URL}",
         ),
     )
+    memory_provider = Provider(
+        provider_id="faiss",
+        provider_type="inline::faiss",
+        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
+    )
 
     inference_model = ModelInput(
         model_id="${env.INFERENCE_MODEL}",
@@ -38,7 +47,7 @@ def get_distribution_template() -> DistributionTemplate:
     )
 
     return DistributionTemplate(
-        name="tgi",
+        name=name,
         distro_type="self_hosted",
         description="Use (an external) TGI server for running LLM inference",
         docker_image=None,
@@ -49,6 +58,7 @@ def get_distribution_template() -> DistributionTemplate:
             "run.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [inference_provider],
+                    "memory": [memory_provider],
                 },
                 default_models=[inference_model],
             ),
@@ -64,6 +74,7 @@ def get_distribution_template() -> DistributionTemplate:
                         ),
                     ),
                 ],
+                "memory": [memory_provider],
             },
             default_models=[
                 inference_model,
diff --git a/llama_stack/templates/together/build.yaml b/llama_stack/templates/together/build.yaml
index a4402ba93..6930b7692 100644
--- a/llama_stack/templates/together/build.yaml
+++ b/llama_stack/templates/together/build.yaml
@@ -16,4 +16,13 @@ distribution_spec:
     - inline::meta-reference
     telemetry:
     - inline::meta-reference
+    eval:
+    - inline::meta-reference
+    datasetio:
+    - remote::huggingface
+    - inline::localfs
+    scoring:
+    - inline::basic
+    - inline::llm-as-judge
+    - inline::braintrust
 image_type: conda
diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml
index 855ba0626..a9f96a099 100644
--- a/llama_stack/templates/together/run.yaml
+++ b/llama_stack/templates/together/run.yaml
@@ -4,9 +4,12 @@ docker_image: null
 conda_env: together
 apis:
 - agents
+- datasetio
+- eval
 - inference
 - memory
 - safety
+- scoring
 - telemetry
 providers:
   inference:
@@ -39,6 +42,27 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config: {}
+  eval:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config: {}
+  datasetio:
+  - provider_id: huggingface
+    provider_type: remote::huggingface
+    config: {}
+  - provider_id: localfs
+    provider_type: inline::localfs
+    config: {}
+  scoring:
+  - provider_id: basic
+    provider_type: inline::basic
+    config: {}
+  - provider_id: llm-as-judge
+    provider_type: inline::llm-as-judge
+    config: {}
+  - provider_id: braintrust
+    provider_type: inline::braintrust
+    config: {}
 metadata_store:
   namespace: null
   type: sqlite
diff --git a/llama_stack/templates/together/together.py b/llama_stack/templates/together/together.py
index 16265b04f..6656cfe44 100644
--- a/llama_stack/templates/together/together.py
+++ b/llama_stack/templates/together/together.py
@@ -9,6 +9,7 @@ from pathlib import Path
 from llama_models.sku_list import all_registered_models
 
 from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput
+from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.providers.remote.inference.together import TogetherImplConfig
 from llama_stack.providers.remote.inference.together.together import MODEL_ALIASES
 
@@ -22,13 +23,21 @@ def get_distribution_template() -> DistributionTemplate:
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
         "telemetry": ["inline::meta-reference"],
+        "eval": ["inline::meta-reference"],
+        "datasetio": ["remote::huggingface", "inline::localfs"],
+        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
     }
-
+    name = "together"
     inference_provider = Provider(
         provider_id="together",
         provider_type="remote::together",
         config=TogetherImplConfig.sample_run_config(),
     )
+    memory_provider = Provider(
+        provider_id="faiss",
+        provider_type="inline::faiss",
+        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
+    )
 
     core_model_to_hf_repo = {
         m.descriptor(): m.huggingface_repo for m in all_registered_models()
@@ -42,7 +51,7 @@ def get_distribution_template() -> DistributionTemplate:
     ]
 
     return DistributionTemplate(
-        name="together",
+        name=name,
         distro_type="self_hosted",
         description="Use Together.AI for running LLM inference",
         docker_image=None,
@@ -53,6 +62,7 @@ def get_distribution_template() -> DistributionTemplate:
             "run.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [inference_provider],
+                    "memory": [memory_provider],
                 },
                 default_models=default_models,
                 default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
diff --git a/llama_stack/templates/vllm-gpu/build.yaml b/llama_stack/templates/vllm-gpu/build.yaml
index 6792a855f..4289296ec 100644
--- a/llama_stack/templates/vllm-gpu/build.yaml
+++ b/llama_stack/templates/vllm-gpu/build.yaml
@@ -16,4 +16,13 @@ distribution_spec:
     - inline::meta-reference
     telemetry:
     - inline::meta-reference
+    eval:
+    - inline::meta-reference
+    datasetio:
+    - remote::huggingface
+    - inline::localfs
+    scoring:
+    - inline::basic
+    - inline::llm-as-judge
+    - inline::braintrust
 image_type: conda
diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml
index a140ad403..ea188777f 100644
--- a/llama_stack/templates/vllm-gpu/run.yaml
+++ b/llama_stack/templates/vllm-gpu/run.yaml
@@ -4,9 +4,12 @@ docker_image: null
 conda_env: vllm-gpu
 apis:
 - agents
+- datasetio
+- eval
 - inference
 - memory
 - safety
+- scoring
 - telemetry
 providers:
   inference:
@@ -42,6 +45,27 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config: {}
+  eval:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config: {}
+  datasetio:
+  - provider_id: huggingface
+    provider_type: remote::huggingface
+    config: {}
+  - provider_id: localfs
+    provider_type: inline::localfs
+    config: {}
+  scoring:
+  - provider_id: basic
+    provider_type: inline::basic
+    config: {}
+  - provider_id: llm-as-judge
+    provider_type: inline::llm-as-judge
+    config: {}
+  - provider_id: braintrust
+    provider_type: inline::braintrust
+    config: {}
 metadata_store:
   namespace: null
   type: sqlite
diff --git a/llama_stack/templates/vllm-gpu/vllm.py b/llama_stack/templates/vllm-gpu/vllm.py
index 78fcf4f57..10b448b5c 100644
--- a/llama_stack/templates/vllm-gpu/vllm.py
+++ b/llama_stack/templates/vllm-gpu/vllm.py
@@ -6,6 +6,7 @@
 
 from llama_stack.distribution.datatypes import ModelInput, Provider
 from llama_stack.providers.inline.inference.vllm import VLLMConfig
+from llama_stack.providers.inline.memory.faiss.config import FaissImplConfig
 from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
 
@@ -16,13 +17,21 @@ def get_distribution_template() -> DistributionTemplate:
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
         "telemetry": ["inline::meta-reference"],
+        "eval": ["inline::meta-reference"],
+        "datasetio": ["remote::huggingface", "inline::localfs"],
+        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
     }
-
+    name = "vllm-gpu"
     inference_provider = Provider(
         provider_id="vllm",
         provider_type="inline::vllm",
         config=VLLMConfig.sample_run_config(),
     )
+    memory_provider = Provider(
+        provider_id="faiss",
+        provider_type="inline::faiss",
+        config=FaissImplConfig.sample_run_config(f"distributions/{name}"),
+    )
 
     inference_model = ModelInput(
         model_id="${env.INFERENCE_MODEL}",
@@ -30,7 +39,7 @@ def get_distribution_template() -> DistributionTemplate:
     )
 
     return DistributionTemplate(
-        name="vllm-gpu",
+        name=name,
         distro_type="self_hosted",
         description="Use a built-in vLLM engine for running LLM inference",
         docker_image=None,
@@ -41,6 +50,7 @@ def get_distribution_template() -> DistributionTemplate:
             "run.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [inference_provider],
+                    "memory": [memory_provider],
                 },
                 default_models=[inference_model],
             ),