From c1d18283d2a34f209925e7b1f1ebd30435a447ea Mon Sep 17 00:00:00 2001
From: Xi Yan
Date: Wed, 19 Mar 2025 11:04:23 -0700
Subject: [PATCH] feat(eval api): (2.2/n) delete eval / scoring / scoring_fn
 apis (#1700)

# What does this PR do?

- To make the migration easier, delete the existing `eval` / `scoring` / `scoring_function` APIs outright. This intentionally leaves a number of broken implementations behind; the follow-up sequence is:
  1. migrate the benchmark graders
  2. clean up the existing scoring functions
- Add a skeleton `evaluation` implementation to make tests pass.

## Test Plan

Tested in the following PRs.

[//]: # (## Documentation)
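For reviewers migrating callers, note the candidate schema renames visible in the spec diff below: `ModelCandidate.model` becomes `model_id`, and `AgentCandidate.config` becomes `agent_config`. A minimal before/after sketch of the payloads — the model ID, agent config contents, and empty `sampling_params` are illustrative only, not taken from this patch:

```python
# Candidate payloads under the removed Eval schemas vs. the new Evaluation
# schemas. Only the field renames come from this patch; values are examples.

agent_config = {
    "model": "meta-llama/Llama-3.1-8B-Instruct",  # example model ID
    "instructions": "You are a helpful assistant.",
}

old_model_candidate = {  # removed schema: required field was "model"
    "type": "model",
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "sampling_params": {},  # contents follow SamplingParams, elided here
}
old_agent_candidate = {"type": "agent", "config": agent_config}

new_model_candidate = {  # new schema: required field is "model_id"
    "type": "model",
    "model_id": "meta-llama/Llama-3.1-8B-Instruct",
    "sampling_params": {},
}
new_agent_candidate = {"type": "agent", "agent_config": agent_config}
```

Runs are likewise submitted as a `RunRequest` carrying `task` and `candidate` rather than the old `BenchmarkConfig` with `eval_candidate` and `scoring_params`.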
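The new inline provider is deliberately a stub. A rough sketch of the shape of `llama_stack/providers/inline/evaluation/meta_reference/evaluation.py` — the class and method names below are assumptions inferred from the new `RunRequest`/`GradeRequest` schemas and file names in this patch, not the file's actual contents:

```python
# Hypothetical skeleton provider: methods are stubbed so the stack can
# resolve the evaluation API while the real implementation lands in
# follow-up PRs.
from typing import Any


class MetaReferenceEvaluationImpl:  # assumed class name
    def __init__(self, config: Any) -> None:
        self.config = config

    async def run(self, task: Any, candidate: Any) -> Any:
        raise NotImplementedError("Evaluation.run is not implemented yet")

    async def grade(self, task: Any) -> Any:
        raise NotImplementedError("Evaluation.grade is not implemented yet")
```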
---
 distributions/dependencies.json | 54 -
 docs/_static/llama-stack-spec.html | 1353 +----------------
 docs/_static/llama-stack-spec.yaml | 911 +----------
 .../remote_hosted_distro/nvidia.md | 2 -
 .../self_hosted_distro/bedrock.md | 2 -
 .../self_hosted_distro/cerebras.md | 2 -
 .../self_hosted_distro/fireworks.md | 2 -
 .../distributions/self_hosted_distro/groq.md | 2 -
 .../self_hosted_distro/meta-reference-gpu.md | 2 -
 .../meta-reference-quantized-gpu.md | 2 -
 .../self_hosted_distro/ollama.md | 2 -
 .../self_hosted_distro/passthrough.md | 2 -
 .../self_hosted_distro/remote-vllm.md | 2 -
 .../distributions/self_hosted_distro/tgi.md | 2 -
 .../self_hosted_distro/together.md | 2 -
 llama_stack/apis/datatypes.py | 4 +-
 llama_stack/apis/eval/__init__.py | 7 -
 llama_stack/apis/eval/eval.py | 145 --
 llama_stack/apis/scoring/__init__.py | 7 -
 llama_stack/apis/scoring/scoring.py | 78 -
 .../apis/scoring_functions/__init__.py | 7 -
 .../scoring_functions/scoring_functions.py | 149 --
 llama_stack/distribution/datatypes.py | 8 -
 llama_stack/distribution/distribution.py | 12 +-
 llama_stack/distribution/resolver.py | 30 +-
 llama_stack/distribution/routers/__init__.py | 6 -
 llama_stack/distribution/routers/routers.py | 142 --
 .../distribution/routers/routing_tables.py | 61 +-
 llama_stack/distribution/stack.py | 12 -
 llama_stack/distribution/ui/modules/api.py | 5 +-
 .../ui/page/distribution/resources.py | 6 +-
 llama_stack/providers/datatypes.py | 7 -
 .../inline/eval/meta_reference/__init__.py | 1 -
 .../inline/eval/meta_reference/eval.py | 5 +-
 .../evaluation/meta_reference/__init__.py | 27 +
 .../evaluation/meta_reference/config.py | 26 +
 .../evaluation/meta_reference/evaluation.py | 71 +
 llama_stack/providers/registry/eval.py | 28 -
 llama_stack/providers/registry/evaluation.py | 36 +
 llama_stack/providers/registry/scoring.py | 49 -
 .../utils/common/data_schema_validator.py | 46 +-
 llama_stack/templates/bedrock/bedrock.py | 8 +-
 llama_stack/templates/bedrock/build.yaml | 6 -
 llama_stack/templates/bedrock/run.yaml | 22 -
 llama_stack/templates/cerebras/build.yaml | 6 -
 llama_stack/templates/cerebras/cerebras.py | 8 +-
 llama_stack/templates/cerebras/run.yaml | 22 -
 llama_stack/templates/ci-tests/build.yaml | 6 -
 llama_stack/templates/ci-tests/ci_tests.py | 12 +-
 llama_stack/templates/ci-tests/run.yaml | 22 -
 llama_stack/templates/dell/build.yaml | 6 -
 llama_stack/templates/dell/dell.py | 2 -
 .../templates/dell/run-with-safety.yaml | 22 -
 llama_stack/templates/dell/run.yaml | 22 -
 llama_stack/templates/dev/build.yaml | 6 -
 llama_stack/templates/dev/dev.py | 36 +-
 llama_stack/templates/dev/run.yaml | 22 -
 llama_stack/templates/fireworks/build.yaml | 6 -
 llama_stack/templates/fireworks/fireworks.py | 8 +-
 .../templates/fireworks/run-with-safety.yaml | 22 -
 llama_stack/templates/fireworks/run.yaml | 22 -
 llama_stack/templates/groq/build.yaml | 6 -
 llama_stack/templates/groq/groq.py | 14 +-
 llama_stack/templates/groq/run.yaml | 22 -
 llama_stack/templates/hf-endpoint/build.yaml | 6 -
 .../templates/hf-endpoint/hf_endpoint.py | 2 -
 .../hf-endpoint/run-with-safety.yaml | 22 -
 llama_stack/templates/hf-endpoint/run.yaml | 22 -
 .../templates/hf-serverless/build.yaml | 6 -
 .../templates/hf-serverless/hf_serverless.py | 2 -
 .../hf-serverless/run-with-safety.yaml | 22 -
 llama_stack/templates/hf-serverless/run.yaml | 22 -
 .../templates/meta-reference-gpu/build.yaml | 6 -
 .../meta-reference-gpu/meta_reference.py | 2 -
 .../meta-reference-gpu/run-with-safety.yaml | 22 -
 .../templates/meta-reference-gpu/run.yaml | 22 -
 .../meta-reference-quantized-gpu/build.yaml | 6 -
 .../meta_reference.py | 2 -
 .../meta-reference-quantized-gpu/run.yaml | 22 -
 llama_stack/templates/nvidia/build.yaml | 4 -
 llama_stack/templates/nvidia/nvidia.py | 15 +-
 .../templates/nvidia/run-with-safety.yaml | 15 -
 llama_stack/templates/nvidia/run.yaml | 15 -
 llama_stack/templates/ollama/build.yaml | 6 -
 llama_stack/templates/ollama/ollama.py | 2 -
 .../templates/ollama/run-with-safety.yaml | 22 -
 llama_stack/templates/ollama/run.yaml | 22 -
 .../templates/open-benchmark/build.yaml | 6 -
 .../open-benchmark/open_benchmark.py | 60 +-
 llama_stack/templates/open-benchmark/run.yaml | 49 +-
 llama_stack/templates/passthrough/build.yaml | 6 -
 .../templates/passthrough/passthrough.py | 7 +-
 .../passthrough/run-with-safety.yaml | 22 -
 llama_stack/templates/passthrough/run.yaml | 22 -
 llama_stack/templates/remote-vllm/build.yaml | 6 -
 .../remote-vllm/run-with-safety.yaml | 22 -
 llama_stack/templates/remote-vllm/run.yaml | 22 -
 llama_stack/templates/remote-vllm/vllm.py | 2 -
 llama_stack/templates/sambanova/run.yaml | 1 -
 llama_stack/templates/tgi/build.yaml | 6 -
 .../templates/tgi/run-with-safety.yaml | 22 -
 llama_stack/templates/tgi/run.yaml | 22 -
 llama_stack/templates/tgi/tgi.py | 2 -
 llama_stack/templates/together/build.yaml | 6 -
 .../templates/together/run-with-safety.yaml | 22 -
 llama_stack/templates/together/run.yaml | 22 -
 llama_stack/templates/together/together.py | 8 +-
 llama_stack/templates/vllm-gpu/build.yaml | 6 -
 llama_stack/templates/vllm-gpu/run.yaml | 22 -
 llama_stack/templates/vllm-gpu/vllm.py | 2 -
 pyproject.toml | 4 +-
 tests/integration/eval/test_eval.py | 2 +
 tests/integration/scoring/test_scoring.py | 5 +
 113 files changed, 408 insertions(+), 3900 deletions(-)
 delete mode 100644 llama_stack/apis/eval/__init__.py
 delete mode 100644 llama_stack/apis/eval/eval.py
 delete mode 100644 llama_stack/apis/scoring/__init__.py
 delete mode 100644 llama_stack/apis/scoring/scoring.py
 delete mode 100644 llama_stack/apis/scoring_functions/__init__.py
 delete mode 100644 llama_stack/apis/scoring_functions/scoring_functions.py
 create mode 100644 llama_stack/providers/inline/evaluation/meta_reference/__init__.py
 create mode 100644 llama_stack/providers/inline/evaluation/meta_reference/config.py
 create mode 100644 llama_stack/providers/inline/evaluation/meta_reference/evaluation.py
 delete mode 100644 llama_stack/providers/registry/eval.py
 create mode 100644 llama_stack/providers/registry/evaluation.py
 delete mode 100644 llama_stack/providers/registry/scoring.py

diff --git a/distributions/dependencies.json b/distributions/dependencies.json
index 33b497a33..1767523d6 100644
--- a/distributions/dependencies.json
+++ b/distributions/dependencies.json
@@ -1,7 +1,6 @@ { "bedrock": [ "aiosqlite", - "autoevals", "blobfile", "boto3", "chardet", @@ -15,7 +14,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -30,12 +28,10 @@ "sentencepiece", "tqdm", "transformers", -
"tree_sitter", "uvicorn" ], "cerebras": [ "aiosqlite", - "autoevals", "blobfile", "cerebras_cloud_sdk", "chardet", @@ -48,7 +44,6 @@ "matplotlib", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -63,14 +58,12 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], "ci-tests": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -83,7 +76,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -99,7 +91,6 @@ "sqlite-vec", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -107,7 +98,6 @@ "dell": [ "aiohttp", "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -120,7 +110,6 @@ "matplotlib", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -135,14 +124,12 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], "dev": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -156,7 +143,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -172,14 +158,12 @@ "sqlite-vec", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], "fireworks": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -193,7 +177,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -208,14 +191,12 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], "groq": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "datasets", @@ -227,7 +208,6 @@ "matplotlib", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -242,13 +222,11 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn" ], "hf-endpoint": [ "aiohttp", "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -262,7 +240,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -277,13 +254,11 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn" ], "hf-serverless": [ "aiohttp", "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -297,7 +272,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -312,7 +286,6 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -320,7 +293,6 @@ "meta-reference-gpu": [ "accelerate", "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -335,7 +307,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -353,14 +324,12 @@ "torchvision", "tqdm", "transformers", - "tree_sitter", "uvicorn", "zmq" ], "meta-reference-quantized-gpu": [ "accelerate", 
"aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -376,7 +345,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -395,7 +363,6 @@ "torchvision", "tqdm", "transformers", - "tree_sitter", "uvicorn", "zmq" ], @@ -425,13 +392,11 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn" ], "ollama": [ "aiohttp", "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -445,7 +410,6 @@ "nltk", "numpy", "ollama", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -460,12 +424,10 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn" ], "open-benchmark": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -478,7 +440,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -495,12 +456,10 @@ "together", "tqdm", "transformers", - "tree_sitter", "uvicorn" ], "passthrough": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -513,7 +472,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -528,14 +486,12 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], "remote-vllm": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -563,7 +519,6 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -600,7 +555,6 @@ "tgi": [ "aiohttp", "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -614,7 +568,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -629,14 +582,12 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], "together": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -649,7 +600,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -665,14 +615,12 @@ "together", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], "vllm-gpu": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -685,7 +633,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -700,7 +647,6 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "vllm", "sentence-transformers --no-deps", diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index b1fe8e832..e3505752f 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -909,59 +909,6 @@ } } }, - "/v1/eval/benchmarks/{benchmark_id}/evaluations": { - "post": { - "responses": { - "200": { - "description": "EvaluateResponse object containing generations and scores", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - 
"500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Eval" - ], - "description": "Evaluate a list of rows on a benchmark.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateRowsRequest" - } - } - }, - "required": true - } - } - }, "/v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}": { "get": { "responses": { @@ -1396,48 +1343,6 @@ ] } }, - "/v1/scoring-functions/{scoring_fn_id}": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ScoringFn" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ScoringFunctions" - ], - "description": "", - "parameters": [ - { - "name": "scoring_fn_id", - "in": "path", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, "/v1/shields/{identifier}": { "get": { "responses": { @@ -2372,153 +2277,6 @@ ] } }, - "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}": { - "get": { - "responses": { - "200": { - "description": "The status of the evaluationjob.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/JobStatus" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Eval" - ], - "description": "Get the status of a job.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "description": "The ID of the job to get the status of.", - "required": true, - "schema": { - "type": "string" - } - } - ] - }, - "delete": { - "responses": { - "200": { - "description": "OK" - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Eval" - ], - "description": "Cancel a job.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "description": "The ID of the job to cancel.", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, - "/v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result": { - "get": { - "responses": { - "200": { - "description": "The result of the job.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/EvaluateResponse" - } - } - } - }, - "400": { - "$ref": 
"#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Eval" - ], - "description": "Get the result of a job.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - }, - { - "name": "job_id", - "in": "path", - "description": "The ID of the job to get the result of.", - "required": true, - "schema": { - "type": "string" - } - } - ] - } - }, "/v1/agents/{agent_id}/sessions": { "get": { "responses": { @@ -3050,73 +2808,6 @@ ] } }, - "/v1/scoring-functions": { - "get": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ListScoringFunctionsResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ScoringFunctions" - ], - "description": "", - "parameters": [] - }, - "post": { - "responses": { - "200": { - "description": "OK" - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "ScoringFunctions" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RegisterScoringFunctionRequest" - } - } - }, - "required": true - } - } - }, "/v1/shields": { "get": { "responses": { @@ -3744,59 +3435,6 @@ } } }, - "/v1/eval/benchmarks/{benchmark_id}/jobs": { - "post": { - "responses": { - "200": { - "description": "The job that was created to run the evaluation.", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Job" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Eval" - ], - "description": "Run an evaluation on a benchmark.", - "parameters": [ - { - "name": "benchmark_id", - "in": "path", - "description": "The ID of the benchmark to run the evaluation on.", - "required": true, - "schema": { - "type": "string" - } - } - ], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RunEvalRequest" - } - } - }, - "required": true - } - } - }, "/v1/safety/run-shield": { "post": { "responses": { @@ -3919,92 +3557,6 @@ } } }, - "/v1/scoring/score": { - "post": { - "responses": { - "200": { - "description": "ScoreResponse object containing rows and aggregated results", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ScoreResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": 
"#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Scoring" - ], - "description": "Score a list of rows.", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ScoreRequest" - } - } - }, - "required": true - } - } - }, - "/v1/scoring/score-batch": { - "post": { - "responses": { - "200": { - "description": "OK", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ScoreBatchResponse" - } - } - } - }, - "400": { - "$ref": "#/components/responses/BadRequest400" - }, - "429": { - "$ref": "#/components/responses/TooManyRequests429" - }, - "500": { - "$ref": "#/components/responses/InternalServerError500" - }, - "default": { - "$ref": "#/components/responses/DefaultError" - } - }, - "tags": [ - "Scoring" - ], - "description": "", - "parameters": [], - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ScoreBatchRequest" - } - } - }, - "required": true - } - } - }, "/v1/post-training/supervised-fine-tune": { "post": { "responses": { @@ -6630,381 +6182,6 @@ "title": "EmbeddingsResponse", "description": "Response containing generated embeddings." }, - "AgentCandidate": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "agent", - "default": "agent" - }, - "config": { - "$ref": "#/components/schemas/AgentConfig", - "description": "The configuration for the agent candidate." - } - }, - "additionalProperties": false, - "required": [ - "type", - "config" - ], - "title": "AgentCandidate", - "description": "An agent candidate for evaluation." - }, - "AggregationFunctionType": { - "type": "string", - "enum": [ - "average", - "median", - "categorical_count", - "accuracy" - ], - "title": "AggregationFunctionType" - }, - "BasicScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "basic", - "default": "basic" - }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" - } - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "BasicScoringFnParams" - }, - "BenchmarkConfig": { - "type": "object", - "properties": { - "eval_candidate": { - "$ref": "#/components/schemas/EvalCandidate", - "description": "The candidate to evaluate." - }, - "scoring_params": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringFnParams" - }, - "description": "Map between scoring function id and parameters for each scoring function you want to run" - }, - "num_examples": { - "type": "integer", - "description": "(Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated" - } - }, - "additionalProperties": false, - "required": [ - "eval_candidate", - "scoring_params" - ], - "title": "BenchmarkConfig", - "description": "A benchmark configuration for evaluation." 
- }, - "EvalCandidate": { - "oneOf": [ - { - "$ref": "#/components/schemas/ModelCandidate" - }, - { - "$ref": "#/components/schemas/AgentCandidate" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "model": "#/components/schemas/ModelCandidate", - "agent": "#/components/schemas/AgentCandidate" - } - } - }, - "LLMAsJudgeScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "llm_as_judge", - "default": "llm_as_judge" - }, - "judge_model": { - "type": "string" - }, - "prompt_template": { - "type": "string" - }, - "judge_score_regexes": { - "type": "array", - "items": { - "type": "string" - } - }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" - } - } - }, - "additionalProperties": false, - "required": [ - "type", - "judge_model" - ], - "title": "LLMAsJudgeScoringFnParams" - }, - "ModelCandidate": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "model", - "default": "model" - }, - "model": { - "type": "string", - "description": "The model ID to evaluate." - }, - "sampling_params": { - "$ref": "#/components/schemas/SamplingParams", - "description": "The sampling parameters for the model." - }, - "system_message": { - "$ref": "#/components/schemas/SystemMessage", - "description": "(Optional) The system message providing instructions or context to the model." - } - }, - "additionalProperties": false, - "required": [ - "type", - "model", - "sampling_params" - ], - "title": "ModelCandidate", - "description": "A model candidate for evaluation." - }, - "RegexParserScoringFnParams": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "regex_parser", - "default": "regex_parser" - }, - "parsing_regexes": { - "type": "array", - "items": { - "type": "string" - } - }, - "aggregation_functions": { - "type": "array", - "items": { - "$ref": "#/components/schemas/AggregationFunctionType" - } - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "RegexParserScoringFnParams" - }, - "ScoringFnParams": { - "oneOf": [ - { - "$ref": "#/components/schemas/LLMAsJudgeScoringFnParams" - }, - { - "$ref": "#/components/schemas/RegexParserScoringFnParams" - }, - { - "$ref": "#/components/schemas/BasicScoringFnParams" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "llm_as_judge": "#/components/schemas/LLMAsJudgeScoringFnParams", - "regex_parser": "#/components/schemas/RegexParserScoringFnParams", - "basic": "#/components/schemas/BasicScoringFnParams" - } - } - }, - "EvaluateRowsRequest": { - "type": "object", - "properties": { - "input_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The rows to evaluate." - }, - "scoring_functions": { - "type": "array", - "items": { - "type": "string" - }, - "description": "The scoring functions to use for the evaluation." - }, - "benchmark_config": { - "$ref": "#/components/schemas/BenchmarkConfig", - "description": "The configuration for the benchmark." 
- } - }, - "additionalProperties": false, - "required": [ - "input_rows", - "scoring_functions", - "benchmark_config" - ], - "title": "EvaluateRowsRequest" - }, - "EvaluateResponse": { - "type": "object", - "properties": { - "generations": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The generations from the evaluation." - }, - "scores": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - }, - "description": "The scores from the evaluation." - } - }, - "additionalProperties": false, - "required": [ - "generations", - "scores" - ], - "title": "EvaluateResponse", - "description": "The response from an evaluation." - }, - "ScoringResult": { - "type": "object", - "properties": { - "score_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The scoring result for each row. Each row is a map of column name to value." - }, - "aggregated_results": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - }, - "description": "Map of metric name to aggregated value" - } - }, - "additionalProperties": false, - "required": [ - "score_rows", - "aggregated_results" - ], - "title": "ScoringResult", - "description": "A scoring result for a single row." 
- }, "Agent": { "type": "object", "properties": { @@ -7732,268 +6909,6 @@ ], "title": "ModelType" }, - "AgentTurnInputType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "agent_turn_input", - "default": "agent_turn_input" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "AgentTurnInputType" - }, - "ArrayType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "array", - "default": "array" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "ArrayType" - }, - "BooleanType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "boolean", - "default": "boolean" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "BooleanType" - }, - "ChatCompletionInputType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "chat_completion_input", - "default": "chat_completion_input" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "ChatCompletionInputType" - }, - "CompletionInputType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "completion_input", - "default": "completion_input" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "CompletionInputType" - }, - "JsonType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "json", - "default": "json" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "JsonType" - }, - "NumberType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "number", - "default": "number" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "NumberType" - }, - "ObjectType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "object", - "default": "object" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "ObjectType" - }, - "ParamType": { - "oneOf": [ - { - "$ref": "#/components/schemas/StringType" - }, - { - "$ref": "#/components/schemas/NumberType" - }, - { - "$ref": "#/components/schemas/BooleanType" - }, - { - "$ref": "#/components/schemas/ArrayType" - }, - { - "$ref": "#/components/schemas/ObjectType" - }, - { - "$ref": "#/components/schemas/JsonType" - }, - { - "$ref": "#/components/schemas/UnionType" - }, - { - "$ref": "#/components/schemas/ChatCompletionInputType" - }, - { - "$ref": "#/components/schemas/CompletionInputType" - }, - { - "$ref": "#/components/schemas/AgentTurnInputType" - } - ], - "discriminator": { - "propertyName": "type", - "mapping": { - "string": "#/components/schemas/StringType", - "number": "#/components/schemas/NumberType", - "boolean": "#/components/schemas/BooleanType", - "array": "#/components/schemas/ArrayType", - "object": "#/components/schemas/ObjectType", - "json": "#/components/schemas/JsonType", - "union": "#/components/schemas/UnionType", - "chat_completion_input": "#/components/schemas/ChatCompletionInputType", - "completion_input": "#/components/schemas/CompletionInputType", - "agent_turn_input": "#/components/schemas/AgentTurnInputType" - } - } - }, - "ScoringFn": { - "type": "object", - "properties": { - "identifier": { - "type": "string" - }, - "provider_resource_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, - "type": { - "type": "string", - "const": "scoring_function", - "default": 
"scoring_function" - }, - "description": { - "type": "string" - }, - "metadata": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "return_type": { - "$ref": "#/components/schemas/ParamType" - }, - "params": { - "$ref": "#/components/schemas/ScoringFnParams" - } - }, - "additionalProperties": false, - "required": [ - "identifier", - "provider_resource_id", - "provider_id", - "type", - "metadata", - "return_type" - ], - "title": "ScoringFn" - }, - "StringType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "string", - "default": "string" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "StringType" - }, - "UnionType": { - "type": "object", - "properties": { - "type": { - "type": "string", - "const": "union", - "default": "union" - } - }, - "additionalProperties": false, - "required": [ - "type" - ], - "title": "UnionType" - }, "Shield": { "type": "object", "properties": { @@ -8564,6 +7479,26 @@ ], "title": "GradeRequest" }, + "AgentCandidate": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "agent", + "default": "agent" + }, + "agent_config": { + "$ref": "#/components/schemas/AgentConfig" + } + }, + "additionalProperties": false, + "required": [ + "type", + "agent_config" + ], + "title": "AgentCandidate", + "description": "An agent candidate for evaluation." + }, "EvaluationCandidate": { "oneOf": [ { @@ -8636,6 +7571,35 @@ ], "title": "EvaluationJob" }, + "ModelCandidate": { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "model", + "default": "model" + }, + "model_id": { + "type": "string" + }, + "sampling_params": { + "$ref": "#/components/schemas/SamplingParams", + "description": "The sampling parameters for the model." + }, + "system_message": { + "$ref": "#/components/schemas/SystemMessage", + "description": "(Optional) The system message providing instructions or context to the model." + } + }, + "additionalProperties": false, + "required": [ + "type", + "model_id", + "sampling_params" + ], + "title": "ModelCandidate", + "description": "A model candidate for evaluation." + }, "GradeSyncRequest": { "type": "object", "properties": { @@ -9049,17 +8013,6 @@ "title": "IterrowsResponse", "description": "A paginated list of rows from a dataset." 
}, - "JobStatus": { - "type": "string", - "enum": [ - "completed", - "in_progress", - "failed", - "scheduled", - "cancelled" - ], - "title": "JobStatus" - }, "ListAgentSessionsResponse": { "type": "object", "properties": { @@ -9320,22 +8273,6 @@ ], "title": "ListRoutesResponse" }, - "ListScoringFunctionsResponse": { - "type": "object", - "properties": { - "data": { - "type": "array", - "items": { - "$ref": "#/components/schemas/ScoringFn" - } - } - }, - "additionalProperties": false, - "required": [ - "data" - ], - "title": "ListScoringFunctionsResponse" - }, "ListShieldsResponse": { "type": "object", "properties": { @@ -10520,36 +9457,6 @@ ], "title": "RegisterModelRequest" }, - "RegisterScoringFunctionRequest": { - "type": "object", - "properties": { - "scoring_fn_id": { - "type": "string" - }, - "description": { - "type": "string" - }, - "return_type": { - "$ref": "#/components/schemas/ParamType" - }, - "provider_scoring_fn_id": { - "type": "string" - }, - "provider_id": { - "type": "string" - }, - "params": { - "$ref": "#/components/schemas/ScoringFnParams" - } - }, - "additionalProperties": false, - "required": [ - "scoring_fn_id", - "description", - "return_type" - ], - "title": "RegisterScoringFunctionRequest" - }, "RegisterShieldRequest": { "type": "object", "properties": { @@ -10705,33 +9612,6 @@ ], "title": "RunRequest" }, - "RunEvalRequest": { - "type": "object", - "properties": { - "benchmark_config": { - "$ref": "#/components/schemas/BenchmarkConfig", - "description": "The configuration for the benchmark." - } - }, - "additionalProperties": false, - "required": [ - "benchmark_config" - ], - "title": "RunEvalRequest" - }, - "Job": { - "type": "object", - "properties": { - "job_id": { - "type": "string" - } - }, - "additionalProperties": false, - "required": [ - "job_id" - ], - "title": "Job" - }, "RunShieldRequest": { "type": "object", "properties": { @@ -10837,128 +9717,6 @@ ], "title": "SaveSpansToDatasetRequest" }, - "ScoreRequest": { - "type": "object", - "properties": { - "input_rows": { - "type": "array", - "items": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "type": "null" - }, - { - "type": "boolean" - }, - { - "type": "number" - }, - { - "type": "string" - }, - { - "type": "array" - }, - { - "type": "object" - } - ] - } - }, - "description": "The rows to score." - }, - "scoring_functions": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "$ref": "#/components/schemas/ScoringFnParams" - }, - { - "type": "null" - } - ] - }, - "description": "The scoring functions to use for the scoring." - } - }, - "additionalProperties": false, - "required": [ - "input_rows", - "scoring_functions" - ], - "title": "ScoreRequest" - }, - "ScoreResponse": { - "type": "object", - "properties": { - "results": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - }, - "description": "A map of scoring function name to ScoringResult." - } - }, - "additionalProperties": false, - "required": [ - "results" - ], - "title": "ScoreResponse", - "description": "The response from scoring." 
- }, - "ScoreBatchRequest": { - "type": "object", - "properties": { - "dataset_id": { - "type": "string" - }, - "scoring_functions": { - "type": "object", - "additionalProperties": { - "oneOf": [ - { - "$ref": "#/components/schemas/ScoringFnParams" - }, - { - "type": "null" - } - ] - } - }, - "save_results_dataset": { - "type": "boolean" - } - }, - "additionalProperties": false, - "required": [ - "dataset_id", - "scoring_functions", - "save_results_dataset" - ], - "title": "ScoreBatchRequest" - }, - "ScoreBatchResponse": { - "type": "object", - "properties": { - "dataset_id": { - "type": "string" - }, - "results": { - "type": "object", - "additionalProperties": { - "$ref": "#/components/schemas/ScoringResult" - } - } - }, - "additionalProperties": false, - "required": [ - "results" - ], - "title": "ScoreBatchResponse" - }, "LoraFinetuningConfig": { "type": "object", "properties": { @@ -11311,10 +10069,6 @@ { "name": "Datasets" }, - { - "name": "Eval", - "x-displayName": "Llama Stack Evaluation API for running evaluations on model and agent candidates." - }, { "name": "Evaluation" }, @@ -11345,12 +10099,6 @@ { "name": "Safety" }, - { - "name": "Scoring" - }, - { - "name": "ScoringFunctions" - }, { "name": "Shields" }, @@ -11382,7 +10130,6 @@ "Benchmarks", "DatasetIO", "Datasets", - "Eval", "Evaluation", "Files", "Graders", @@ -11392,8 +10139,6 @@ "PostTraining (Coming Soon)", "Providers", "Safety", - "Scoring", - "ScoringFunctions", "Shields", "SyntheticDataGeneration (Coming Soon)", "Telemetry", diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index f91744f98..1fee27e59 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -622,43 +622,6 @@ paths: schema: $ref: '#/components/schemas/EmbeddingsRequest' required: true - /v1/eval/benchmarks/{benchmark_id}/evaluations: - post: - responses: - '200': - description: >- - EvaluateResponse object containing generations and scores - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Evaluate a list of rows on a benchmark. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. 
- required: true - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateRowsRequest' - required: true /v1/agents/{agent_id}/session/{session_id}/turn/{turn_id}/step/{step_id}: get: responses: @@ -956,34 +919,6 @@ paths: required: true schema: type: string - /v1/scoring-functions/{scoring_fn_id}: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ScoringFn' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ScoringFunctions - description: '' - parameters: - - name: scoring_fn_id - in: path - required: true - schema: - type: string /v1/shields/{identifier}: get: responses: @@ -1627,109 +1562,6 @@ paths: required: false schema: type: integer - /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}: - get: - responses: - '200': - description: The status of the evaluationjob. - content: - application/json: - schema: - $ref: '#/components/schemas/JobStatus' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Get the status of a job. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - - name: job_id - in: path - description: The ID of the job to get the status of. - required: true - schema: - type: string - delete: - responses: - '200': - description: OK - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Cancel a job. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - - name: job_id - in: path - description: The ID of the job to cancel. - required: true - schema: - type: string - /v1/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result: - get: - responses: - '200': - description: The result of the job. - content: - application/json: - schema: - $ref: '#/components/schemas/EvaluateResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Get the result of a job. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - - name: job_id - in: path - description: The ID of the job to get the result of. 
- required: true - schema: - type: string /v1/agents/{agent_id}/sessions: get: responses: @@ -2098,53 +1930,6 @@ paths: required: false schema: $ref: '#/components/schemas/URL' - /v1/scoring-functions: - get: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ListScoringFunctionsResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ScoringFunctions - description: '' - parameters: [] - post: - responses: - '200': - description: OK - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - ScoringFunctions - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RegisterScoringFunctionRequest' - required: true /v1/shields: get: responses: @@ -2581,43 +2366,6 @@ paths: schema: $ref: '#/components/schemas/RunRequest' required: true - /v1/eval/benchmarks/{benchmark_id}/jobs: - post: - responses: - '200': - description: >- - The job that was created to run the evaluation. - content: - application/json: - schema: - $ref: '#/components/schemas/Job' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Eval - description: Run an evaluation on a benchmark. - parameters: - - name: benchmark_id - in: path - description: >- - The ID of the benchmark to run the evaluation on. - required: true - schema: - type: string - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/RunEvalRequest' - required: true /v1/safety/run-shield: post: responses: @@ -2704,65 +2452,6 @@ paths: schema: $ref: '#/components/schemas/SaveSpansToDatasetRequest' required: true - /v1/scoring/score: - post: - responses: - '200': - description: >- - ScoreResponse object containing rows and aggregated results - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Scoring - description: Score a list of rows. 
- parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreRequest' - required: true - /v1/scoring/score-batch: - post: - responses: - '200': - description: OK - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreBatchResponse' - '400': - $ref: '#/components/responses/BadRequest400' - '429': - $ref: >- - #/components/responses/TooManyRequests429 - '500': - $ref: >- - #/components/responses/InternalServerError500 - default: - $ref: '#/components/responses/DefaultError' - tags: - - Scoring - description: '' - parameters: [] - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/ScoreBatchRequest' - required: true /v1/post-training/supervised-fine-tune: post: responses: @@ -4652,251 +4341,6 @@ components: title: EmbeddingsResponse description: >- Response containing generated embeddings. - AgentCandidate: - type: object - properties: - type: - type: string - const: agent - default: agent - config: - $ref: '#/components/schemas/AgentConfig' - description: >- - The configuration for the agent candidate. - additionalProperties: false - required: - - type - - config - title: AgentCandidate - description: An agent candidate for evaluation. - AggregationFunctionType: - type: string - enum: - - average - - median - - categorical_count - - accuracy - title: AggregationFunctionType - BasicScoringFnParams: - type: object - properties: - type: - type: string - const: basic - default: basic - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - title: BasicScoringFnParams - BenchmarkConfig: - type: object - properties: - eval_candidate: - $ref: '#/components/schemas/EvalCandidate' - description: The candidate to evaluate. - scoring_params: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringFnParams' - description: >- - Map between scoring function id and parameters for each scoring function - you want to run - num_examples: - type: integer - description: >- - (Optional) The number of examples to evaluate. If not provided, all examples - in the dataset will be evaluated - additionalProperties: false - required: - - eval_candidate - - scoring_params - title: BenchmarkConfig - description: >- - A benchmark configuration for evaluation. - EvalCandidate: - oneOf: - - $ref: '#/components/schemas/ModelCandidate' - - $ref: '#/components/schemas/AgentCandidate' - discriminator: - propertyName: type - mapping: - model: '#/components/schemas/ModelCandidate' - agent: '#/components/schemas/AgentCandidate' - LLMAsJudgeScoringFnParams: - type: object - properties: - type: - type: string - const: llm_as_judge - default: llm_as_judge - judge_model: - type: string - prompt_template: - type: string - judge_score_regexes: - type: array - items: - type: string - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - - judge_model - title: LLMAsJudgeScoringFnParams - ModelCandidate: - type: object - properties: - type: - type: string - const: model - default: model - model: - type: string - description: The model ID to evaluate. - sampling_params: - $ref: '#/components/schemas/SamplingParams' - description: The sampling parameters for the model. 
- system_message: - $ref: '#/components/schemas/SystemMessage' - description: >- - (Optional) The system message providing instructions or context to the - model. - additionalProperties: false - required: - - type - - model - - sampling_params - title: ModelCandidate - description: A model candidate for evaluation. - RegexParserScoringFnParams: - type: object - properties: - type: - type: string - const: regex_parser - default: regex_parser - parsing_regexes: - type: array - items: - type: string - aggregation_functions: - type: array - items: - $ref: '#/components/schemas/AggregationFunctionType' - additionalProperties: false - required: - - type - title: RegexParserScoringFnParams - ScoringFnParams: - oneOf: - - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams' - - $ref: '#/components/schemas/RegexParserScoringFnParams' - - $ref: '#/components/schemas/BasicScoringFnParams' - discriminator: - propertyName: type - mapping: - llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams' - regex_parser: '#/components/schemas/RegexParserScoringFnParams' - basic: '#/components/schemas/BasicScoringFnParams' - EvaluateRowsRequest: - type: object - properties: - input_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: The rows to evaluate. - scoring_functions: - type: array - items: - type: string - description: >- - The scoring functions to use for the evaluation. - benchmark_config: - $ref: '#/components/schemas/BenchmarkConfig' - description: The configuration for the benchmark. - additionalProperties: false - required: - - input_rows - - scoring_functions - - benchmark_config - title: EvaluateRowsRequest - EvaluateResponse: - type: object - properties: - generations: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: The generations from the evaluation. - scores: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - description: The scores from the evaluation. - additionalProperties: false - required: - - generations - - scores - title: EvaluateResponse - description: The response from an evaluation. - ScoringResult: - type: object - properties: - score_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: >- - The scoring result for each row. Each row is a map of column name to value. - aggregated_results: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: Map of metric name to aggregated value - additionalProperties: false - required: - - score_rows - - aggregated_results - title: ScoringResult - description: A scoring result for a single row. 
Agent: type: object properties: @@ -5392,179 +4836,6 @@ components: - llm - embedding title: ModelType - AgentTurnInputType: - type: object - properties: - type: - type: string - const: agent_turn_input - default: agent_turn_input - additionalProperties: false - required: - - type - title: AgentTurnInputType - ArrayType: - type: object - properties: - type: - type: string - const: array - default: array - additionalProperties: false - required: - - type - title: ArrayType - BooleanType: - type: object - properties: - type: - type: string - const: boolean - default: boolean - additionalProperties: false - required: - - type - title: BooleanType - ChatCompletionInputType: - type: object - properties: - type: - type: string - const: chat_completion_input - default: chat_completion_input - additionalProperties: false - required: - - type - title: ChatCompletionInputType - CompletionInputType: - type: object - properties: - type: - type: string - const: completion_input - default: completion_input - additionalProperties: false - required: - - type - title: CompletionInputType - JsonType: - type: object - properties: - type: - type: string - const: json - default: json - additionalProperties: false - required: - - type - title: JsonType - NumberType: - type: object - properties: - type: - type: string - const: number - default: number - additionalProperties: false - required: - - type - title: NumberType - ObjectType: - type: object - properties: - type: - type: string - const: object - default: object - additionalProperties: false - required: - - type - title: ObjectType - ParamType: - oneOf: - - $ref: '#/components/schemas/StringType' - - $ref: '#/components/schemas/NumberType' - - $ref: '#/components/schemas/BooleanType' - - $ref: '#/components/schemas/ArrayType' - - $ref: '#/components/schemas/ObjectType' - - $ref: '#/components/schemas/JsonType' - - $ref: '#/components/schemas/UnionType' - - $ref: '#/components/schemas/ChatCompletionInputType' - - $ref: '#/components/schemas/CompletionInputType' - - $ref: '#/components/schemas/AgentTurnInputType' - discriminator: - propertyName: type - mapping: - string: '#/components/schemas/StringType' - number: '#/components/schemas/NumberType' - boolean: '#/components/schemas/BooleanType' - array: '#/components/schemas/ArrayType' - object: '#/components/schemas/ObjectType' - json: '#/components/schemas/JsonType' - union: '#/components/schemas/UnionType' - chat_completion_input: '#/components/schemas/ChatCompletionInputType' - completion_input: '#/components/schemas/CompletionInputType' - agent_turn_input: '#/components/schemas/AgentTurnInputType' - ScoringFn: - type: object - properties: - identifier: - type: string - provider_resource_id: - type: string - provider_id: - type: string - type: - type: string - const: scoring_function - default: scoring_function - description: - type: string - metadata: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - return_type: - $ref: '#/components/schemas/ParamType' - params: - $ref: '#/components/schemas/ScoringFnParams' - additionalProperties: false - required: - - identifier - - provider_resource_id - - provider_id - - type - - metadata - - return_type - title: ScoringFn - StringType: - type: object - properties: - type: - type: string - const: string - default: string - additionalProperties: false - required: - - type - title: StringType - UnionType: - type: object - properties: - type: - type: string - const: 
union - default: union - additionalProperties: false - required: - - type - title: UnionType Shield: type: object properties: @@ -5947,6 +5218,21 @@ components: required: - task title: GradeRequest + AgentCandidate: + type: object + properties: + type: + type: string + const: agent + default: agent + agent_config: + $ref: '#/components/schemas/AgentConfig' + additionalProperties: false + required: + - type + - agent_config + title: AgentCandidate + description: An agent candidate for evaluation. EvaluationCandidate: oneOf: - $ref: '#/components/schemas/ModelCandidate' @@ -6000,6 +5286,30 @@ components: - task - candidate title: EvaluationJob + ModelCandidate: + type: object + properties: + type: + type: string + const: model + default: model + model_id: + type: string + sampling_params: + $ref: '#/components/schemas/SamplingParams' + description: The sampling parameters for the model. + system_message: + $ref: '#/components/schemas/SystemMessage' + description: >- + (Optional) The system message providing instructions or context to the + model. + additionalProperties: false + required: + - type + - model_id + - sampling_params + title: ModelCandidate + description: A model candidate for evaluation. GradeSyncRequest: type: object properties: @@ -6235,15 +5545,6 @@ components: - data title: IterrowsResponse description: A paginated list of rows from a dataset. - JobStatus: - type: string - enum: - - completed - - in_progress - - failed - - scheduled - - cancelled - title: JobStatus ListAgentSessionsResponse: type: object properties: @@ -6436,17 +5737,6 @@ components: required: - data title: ListRoutesResponse - ListScoringFunctionsResponse: - type: object - properties: - data: - type: array - items: - $ref: '#/components/schemas/ScoringFn' - additionalProperties: false - required: - - data - title: ListScoringFunctionsResponse ListShieldsResponse: type: object properties: @@ -7223,27 +6513,6 @@ components: required: - model_id title: RegisterModelRequest - RegisterScoringFunctionRequest: - type: object - properties: - scoring_fn_id: - type: string - description: - type: string - return_type: - $ref: '#/components/schemas/ParamType' - provider_scoring_fn_id: - type: string - provider_id: - type: string - params: - $ref: '#/components/schemas/ScoringFnParams' - additionalProperties: false - required: - - scoring_fn_id - - description - - return_type - title: RegisterScoringFunctionRequest RegisterShieldRequest: type: object properties: @@ -7344,25 +6613,6 @@ components: - task - candidate title: RunRequest - RunEvalRequest: - type: object - properties: - benchmark_config: - $ref: '#/components/schemas/BenchmarkConfig' - description: The configuration for the benchmark. - additionalProperties: false - required: - - benchmark_config - title: RunEvalRequest - Job: - type: object - properties: - job_id: - type: string - additionalProperties: false - required: - - job_id - title: Job RunShieldRequest: type: object properties: @@ -7435,81 +6685,6 @@ components: - attributes_to_save - dataset_id title: SaveSpansToDatasetRequest - ScoreRequest: - type: object - properties: - input_rows: - type: array - items: - type: object - additionalProperties: - oneOf: - - type: 'null' - - type: boolean - - type: number - - type: string - - type: array - - type: object - description: The rows to score. - scoring_functions: - type: object - additionalProperties: - oneOf: - - $ref: '#/components/schemas/ScoringFnParams' - - type: 'null' - description: >- - The scoring functions to use for the scoring. 
- additionalProperties: false - required: - - input_rows - - scoring_functions - title: ScoreRequest - ScoreResponse: - type: object - properties: - results: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - description: >- - A map of scoring function name to ScoringResult. - additionalProperties: false - required: - - results - title: ScoreResponse - description: The response from scoring. - ScoreBatchRequest: - type: object - properties: - dataset_id: - type: string - scoring_functions: - type: object - additionalProperties: - oneOf: - - $ref: '#/components/schemas/ScoringFnParams' - - type: 'null' - save_results_dataset: - type: boolean - additionalProperties: false - required: - - dataset_id - - scoring_functions - - save_results_dataset - title: ScoreBatchRequest - ScoreBatchResponse: - type: object - properties: - dataset_id: - type: string - results: - type: object - additionalProperties: - $ref: '#/components/schemas/ScoringResult' - additionalProperties: false - required: - - results - title: ScoreBatchResponse LoraFinetuningConfig: type: object properties: @@ -7740,9 +6915,6 @@ tags: - name: Benchmarks - name: DatasetIO - name: Datasets - - name: Eval - x-displayName: >- - Llama Stack Evaluation API for running evaluations on model and agent candidates. - name: Evaluation - name: Files - name: Graders @@ -7765,8 +6937,6 @@ tags: x-displayName: >- Providers API for inspecting, listing, and modifying providers and their configurations. - name: Safety - - name: Scoring - - name: ScoringFunctions - name: Shields - name: SyntheticDataGeneration (Coming Soon) - name: Telemetry @@ -7782,7 +6952,6 @@ x-tagGroups: - Benchmarks - DatasetIO - Datasets - - Eval - Evaluation - Files - Graders @@ -7792,8 +6961,6 @@ x-tagGroups: - PostTraining (Coming Soon) - Providers - Safety - - Scoring - - ScoringFunctions - Shields - SyntheticDataGeneration (Coming Soon) - Telemetry diff --git a/docs/source/distributions/remote_hosted_distro/nvidia.md b/docs/source/distributions/remote_hosted_distro/nvidia.md index 774d5ec1b..5d7bb4ef4 100644 --- a/docs/source/distributions/remote_hosted_distro/nvidia.md +++ b/docs/source/distributions/remote_hosted_distro/nvidia.md @@ -7,10 +7,8 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov |-----|-------------| | agents | `inline::meta-reference` | | datasetio | `inline::localfs` | -| eval | `inline::meta-reference` | | inference | `remote::nvidia` | | safety | `remote::nvidia` | -| scoring | `inline::basic` | | telemetry | `inline::meta-reference` | | tool_runtime | `inline::rag-runtime` | | vector_io | `inline::faiss` | diff --git a/docs/source/distributions/self_hosted_distro/bedrock.md b/docs/source/distributions/self_hosted_distro/bedrock.md index 623ab6848..92a934222 100644 --- a/docs/source/distributions/self_hosted_distro/bedrock.md +++ b/docs/source/distributions/self_hosted_distro/bedrock.md @@ -14,10 +14,8 @@ The `llamastack/distribution-bedrock` distribution consists of the following pro |-----|-------------| | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | | inference | `remote::bedrock` | | safety | `remote::bedrock` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | | vector_io | 
`inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/docs/source/distributions/self_hosted_distro/cerebras.md b/docs/source/distributions/self_hosted_distro/cerebras.md index 8f14ae7cc..4b6673830 100644 --- a/docs/source/distributions/self_hosted_distro/cerebras.md +++ b/docs/source/distributions/self_hosted_distro/cerebras.md @@ -7,10 +7,8 @@ The `llamastack/distribution-cerebras` distribution consists of the following pr |-----|-------------| | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | | inference | `remote::cerebras`, `inline::sentence-transformers` | | safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index 3c8f5eec9..1ab3db57d 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -17,10 +17,8 @@ The `llamastack/distribution-fireworks` distribution consists of the following p |-----|-------------| | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | | inference | `remote::fireworks`, `inline::sentence-transformers` | | safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/docs/source/distributions/self_hosted_distro/groq.md b/docs/source/distributions/self_hosted_distro/groq.md index ce3f8aecc..cc25bc955 100644 --- a/docs/source/distributions/self_hosted_distro/groq.md +++ b/docs/source/distributions/self_hosted_distro/groq.md @@ -17,10 +17,8 @@ The `llamastack/distribution-groq` distribution consists of the following provid |-----|-------------| | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | | inference | `remote::groq` | | safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime` | | vector_io | `inline::faiss` | diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index b8d1b1714..c5e5fccc2 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -17,10 +17,8 @@ The `llamastack/distribution-meta-reference-gpu` distribution consists of the fo |-----|-------------| | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | | inference | `inline::meta-reference` | | safety | `inline::llama-guard` | -| 
scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md index a49175e22..9dcffb536 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md @@ -17,10 +17,8 @@ The `llamastack/distribution-meta-reference-quantized-gpu` distribution consists |-----|-------------| | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | | inference | `inline::meta-reference-quantized` | | safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index 9bfa4211c..f525c24aa 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -17,10 +17,8 @@ The `llamastack/distribution-ollama` distribution consists of the following prov |-----|-------------| | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | | inference | `remote::ollama` | | safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/docs/source/distributions/self_hosted_distro/passthrough.md b/docs/source/distributions/self_hosted_distro/passthrough.md index 558d7ca08..b13f41cad 100644 --- a/docs/source/distributions/self_hosted_distro/passthrough.md +++ b/docs/source/distributions/self_hosted_distro/passthrough.md @@ -17,10 +17,8 @@ The `llamastack/distribution-passthrough` distribution consists of the following |-----|-------------| | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | | inference | `remote::passthrough`, `inline::sentence-transformers` | | safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `remote::wolfram-alpha`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md index b7e155385..b33105cb2 
100644 --- a/docs/source/distributions/self_hosted_distro/remote-vllm.md +++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md @@ -16,10 +16,8 @@ The `llamastack/distribution-remote-vllm` distribution consists of the following |-----|-------------| | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | | inference | `remote::vllm`, `inline::sentence-transformers` | | safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/docs/source/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md index e126f9a08..6852f843c 100644 --- a/docs/source/distributions/self_hosted_distro/tgi.md +++ b/docs/source/distributions/self_hosted_distro/tgi.md @@ -18,10 +18,8 @@ The `llamastack/distribution-tgi` distribution consists of the following provide |-----|-------------| | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | | inference | `remote::tgi`, `inline::sentence-transformers` | | safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md index fa02199b0..ebfda2e26 100644 --- a/docs/source/distributions/self_hosted_distro/together.md +++ b/docs/source/distributions/self_hosted_distro/together.md @@ -17,10 +17,8 @@ The `llamastack/distribution-together` distribution consists of the following pr |-----|-------------| | agents | `inline::meta-reference` | | datasetio | `remote::huggingface`, `inline::localfs` | -| eval | `inline::meta-reference` | | inference | `remote::together`, `inline::sentence-transformers` | | safety | `inline::llama-guard` | -| scoring | `inline::basic`, `inline::llm-as-judge`, `inline::braintrust` | | telemetry | `inline::meta-reference` | | tool_runtime | `remote::brave-search`, `remote::tavily-search`, `inline::code-interpreter`, `inline::rag-runtime`, `remote::model-context-protocol`, `remote::wolfram-alpha` | | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | diff --git a/llama_stack/apis/datatypes.py b/llama_stack/apis/datatypes.py index f644e5137..a373d8165 100644 --- a/llama_stack/apis/datatypes.py +++ b/llama_stack/apis/datatypes.py @@ -20,10 +20,9 @@ class Api(Enum): agents = "agents" vector_io = "vector_io" datasetio = "datasetio" - scoring = "scoring" - eval = "eval" post_training = "post_training" tool_runtime = "tool_runtime" + evaluation = "evaluation" telemetry = "telemetry" @@ -31,7 +30,6 @@ class Api(Enum): shields = "shields" vector_dbs = "vector_dbs" datasets = "datasets" - scoring_functions = "scoring_functions" benchmarks = "benchmarks" tool_groups = "tool_groups" diff --git a/llama_stack/apis/eval/__init__.py b/llama_stack/apis/eval/__init__.py 
deleted file mode 100644 index 5f91ad70d..000000000 --- a/llama_stack/apis/eval/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .eval import * # noqa: F401 F403 diff --git a/llama_stack/apis/eval/eval.py b/llama_stack/apis/eval/eval.py deleted file mode 100644 index 51c38b16a..000000000 --- a/llama_stack/apis/eval/eval.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from typing import Any, Dict, List, Literal, Optional, Protocol, Union - -from pydantic import BaseModel, Field -from typing_extensions import Annotated - -from llama_stack.apis.agents import AgentConfig -from llama_stack.apis.common.job_types import Job, JobStatus -from llama_stack.apis.inference import SamplingParams, SystemMessage -from llama_stack.apis.scoring import ScoringResult -from llama_stack.apis.scoring_functions import ScoringFnParams -from llama_stack.schema_utils import json_schema_type, register_schema, webmethod - - -@json_schema_type -class ModelCandidate(BaseModel): - """A model candidate for evaluation. - - :param model: The model ID to evaluate. - :param sampling_params: The sampling parameters for the model. - :param system_message: (Optional) The system message providing instructions or context to the model. - """ - - type: Literal["model"] = "model" - model: str - sampling_params: SamplingParams - system_message: Optional[SystemMessage] = None - - -@json_schema_type -class AgentCandidate(BaseModel): - """An agent candidate for evaluation. - - :param config: The configuration for the agent candidate. - """ - - type: Literal["agent"] = "agent" - config: AgentConfig - - -EvalCandidate = register_schema( - Annotated[Union[ModelCandidate, AgentCandidate], Field(discriminator="type")], - name="EvalCandidate", -) - - -@json_schema_type -class BenchmarkConfig(BaseModel): - """A benchmark configuration for evaluation. - - :param eval_candidate: The candidate to evaluate. - :param scoring_params: Map between scoring function id and parameters for each scoring function you want to run - :param num_examples: (Optional) The number of examples to evaluate. If not provided, all examples in the dataset will be evaluated - """ - - eval_candidate: EvalCandidate - scoring_params: Dict[str, ScoringFnParams] = Field( - description="Map between scoring function id and parameters for each scoring function you want to run", - default_factory=dict, - ) - num_examples: Optional[int] = Field( - description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated", - default=None, - ) - # we could optinally add any specific dataset config here - - -@json_schema_type -class EvaluateResponse(BaseModel): - """The response from an evaluation. - - :param generations: The generations from the evaluation. - :param scores: The scores from the evaluation. 
- """ - - generations: List[Dict[str, Any]] - # each key in the dict is a scoring function name - scores: Dict[str, ScoringResult] - - -class Eval(Protocol): - """Llama Stack Evaluation API for running evaluations on model and agent candidates.""" - - @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs", method="POST") - async def run_eval( - self, - benchmark_id: str, - benchmark_config: BenchmarkConfig, - ) -> Job: - """Run an evaluation on a benchmark. - - :param benchmark_id: The ID of the benchmark to run the evaluation on. - :param benchmark_config: The configuration for the benchmark. - :return: The job that was created to run the evaluation. - """ - - @webmethod(route="/eval/benchmarks/{benchmark_id}/evaluations", method="POST") - async def evaluate_rows( - self, - benchmark_id: str, - input_rows: List[Dict[str, Any]], - scoring_functions: List[str], - benchmark_config: BenchmarkConfig, - ) -> EvaluateResponse: - """Evaluate a list of rows on a benchmark. - - :param benchmark_id: The ID of the benchmark to run the evaluation on. - :param input_rows: The rows to evaluate. - :param scoring_functions: The scoring functions to use for the evaluation. - :param benchmark_config: The configuration for the benchmark. - :return: EvaluateResponse object containing generations and scores - """ - - @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="GET") - async def job_status(self, benchmark_id: str, job_id: str) -> JobStatus: - """Get the status of a job. - - :param benchmark_id: The ID of the benchmark to run the evaluation on. - :param job_id: The ID of the job to get the status of. - :return: The status of the evaluationjob. - """ - ... - - @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}", method="DELETE") - async def job_cancel(self, benchmark_id: str, job_id: str) -> None: - """Cancel a job. - - :param benchmark_id: The ID of the benchmark to run the evaluation on. - :param job_id: The ID of the job to cancel. - """ - ... - - @webmethod(route="/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result", method="GET") - async def job_result(self, benchmark_id: str, job_id: str) -> EvaluateResponse: - """Get the result of a job. - - :param benchmark_id: The ID of the benchmark to run the evaluation on. - :param job_id: The ID of the job to get the result of. - :return: The result of the job. - """ diff --git a/llama_stack/apis/scoring/__init__.py b/llama_stack/apis/scoring/__init__.py deleted file mode 100644 index 0739dfc80..000000000 --- a/llama_stack/apis/scoring/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .scoring import * # noqa: F401 F403 diff --git a/llama_stack/apis/scoring/scoring.py b/llama_stack/apis/scoring/scoring.py deleted file mode 100644 index 54a9ac2aa..000000000 --- a/llama_stack/apis/scoring/scoring.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. 
- -from typing import Any, Dict, List, Optional, Protocol, runtime_checkable - -from pydantic import BaseModel - -from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnParams -from llama_stack.schema_utils import json_schema_type, webmethod - -# mapping of metric to value -ScoringResultRow = Dict[str, Any] - - -@json_schema_type -class ScoringResult(BaseModel): - """ - A scoring result for a single row. - - :param score_rows: The scoring result for each row. Each row is a map of column name to value. - :param aggregated_results: Map of metric name to aggregated value - """ - - score_rows: List[ScoringResultRow] - # aggregated metrics to value - aggregated_results: Dict[str, Any] - - -@json_schema_type -class ScoreBatchResponse(BaseModel): - dataset_id: Optional[str] = None - results: Dict[str, ScoringResult] - - -@json_schema_type -class ScoreResponse(BaseModel): - """ - The response from scoring. - - :param results: A map of scoring function name to ScoringResult. - """ - - # each key in the dict is a scoring function name - results: Dict[str, ScoringResult] - - -class ScoringFunctionStore(Protocol): - def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn: ... - - -@runtime_checkable -class Scoring(Protocol): - scoring_function_store: ScoringFunctionStore - - @webmethod(route="/scoring/score-batch", method="POST") - async def score_batch( - self, - dataset_id: str, - scoring_functions: Dict[str, Optional[ScoringFnParams]], - save_results_dataset: bool = False, - ) -> ScoreBatchResponse: ... - - @webmethod(route="/scoring/score", method="POST") - async def score( - self, - input_rows: List[Dict[str, Any]], - scoring_functions: Dict[str, Optional[ScoringFnParams]], - ) -> ScoreResponse: - """Score a list of rows. - - :param input_rows: The rows to score. - :param scoring_functions: The scoring functions to use for the scoring. - :return: ScoreResponse object containing rows and aggregated results - """ - ... diff --git a/llama_stack/apis/scoring_functions/__init__.py b/llama_stack/apis/scoring_functions/__init__.py deleted file mode 100644 index b96acb45f..000000000 --- a/llama_stack/apis/scoring_functions/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from .scoring_functions import * # noqa: F401 F403 diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py deleted file mode 100644 index b02a7a0c4..000000000 --- a/llama_stack/apis/scoring_functions/scoring_functions.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the terms described in the LICENSE file in -# the root directory of this source tree. - -from enum import Enum -from typing import ( - Any, - Dict, - List, - Literal, - Optional, - Protocol, - Union, - runtime_checkable, -) - -from pydantic import BaseModel, Field -from typing_extensions import Annotated - -from llama_stack.apis.common.type_system import ParamType -from llama_stack.apis.resource import Resource, ResourceType -from llama_stack.schema_utils import json_schema_type, register_schema, webmethod - - -# Perhaps more structure can be imposed on these functions. Maybe they could be associated -# with standard metrics so they can be rolled up? 
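Likewise, a minimal sketch (again inside an async context) of the Scoring protocol removed in scoring.py above; `scoring_impl` is assumed to satisfy the protocol, and the scoring-function id and row columns are illustrative:

    # Score one row with a single scoring function; None leaves the function's
    # params at their registered defaults (the pattern the UI helper used).
    response = await scoring_impl.score(
        input_rows=[{"input_query": "What is 2+2?", "generated_answer": "4"}],
        scoring_functions={"basic::equality": None},
    )
    for fn_id, result in response.results.items():
        print(fn_id, result.aggregated_results)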
-@json_schema_type -class ScoringFnParamsType(Enum): - llm_as_judge = "llm_as_judge" - regex_parser = "regex_parser" - basic = "basic" - - -@json_schema_type -class AggregationFunctionType(Enum): - average = "average" - median = "median" - categorical_count = "categorical_count" - accuracy = "accuracy" - - -@json_schema_type -class LLMAsJudgeScoringFnParams(BaseModel): - type: Literal[ScoringFnParamsType.llm_as_judge.value] = ScoringFnParamsType.llm_as_judge.value - judge_model: str - prompt_template: Optional[str] = None - judge_score_regexes: Optional[List[str]] = Field( - description="Regexes to extract the answer from generated response", - default_factory=list, - ) - aggregation_functions: Optional[List[AggregationFunctionType]] = Field( - description="Aggregation functions to apply to the scores of each row", - default_factory=list, - ) - - -@json_schema_type -class RegexParserScoringFnParams(BaseModel): - type: Literal[ScoringFnParamsType.regex_parser.value] = ScoringFnParamsType.regex_parser.value - parsing_regexes: Optional[List[str]] = Field( - description="Regex to extract the answer from generated response", - default_factory=list, - ) - aggregation_functions: Optional[List[AggregationFunctionType]] = Field( - description="Aggregation functions to apply to the scores of each row", - default_factory=list, - ) - - -@json_schema_type -class BasicScoringFnParams(BaseModel): - type: Literal[ScoringFnParamsType.basic.value] = ScoringFnParamsType.basic.value - aggregation_functions: Optional[List[AggregationFunctionType]] = Field( - description="Aggregation functions to apply to the scores of each row", - default_factory=list, - ) - - -ScoringFnParams = register_schema( - Annotated[ - Union[ - LLMAsJudgeScoringFnParams, - RegexParserScoringFnParams, - BasicScoringFnParams, - ], - Field(discriminator="type"), - ], - name="ScoringFnParams", -) - - -class CommonScoringFnFields(BaseModel): - description: Optional[str] = None - metadata: Dict[str, Any] = Field( - default_factory=dict, - description="Any additional metadata for this definition", - ) - return_type: ParamType = Field( - description="The return type of the deterministic function", - ) - params: Optional[ScoringFnParams] = Field( - description="The parameters for the scoring function for benchmark eval, these can be overridden for app eval", - default=None, - ) - - -@json_schema_type -class ScoringFn(CommonScoringFnFields, Resource): - type: Literal[ResourceType.scoring_function.value] = ResourceType.scoring_function.value - - @property - def scoring_fn_id(self) -> str: - return self.identifier - - @property - def provider_scoring_fn_id(self) -> str: - return self.provider_resource_id - - -class ScoringFnInput(CommonScoringFnFields, BaseModel): - scoring_fn_id: str - provider_id: Optional[str] = None - provider_scoring_fn_id: Optional[str] = None - - -class ListScoringFunctionsResponse(BaseModel): - data: List[ScoringFn] - - -@runtime_checkable -class ScoringFunctions(Protocol): - @webmethod(route="/scoring-functions", method="GET") - async def list_scoring_functions(self) -> ListScoringFunctionsResponse: ... - - @webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="GET") - async def get_scoring_function(self, scoring_fn_id: str, /) -> ScoringFn: ... 
- - @webmethod(route="/scoring-functions", method="POST") - async def register_scoring_function( - self, - scoring_fn_id: str, - description: str, - return_type: ParamType, - provider_scoring_fn_id: Optional[str] = None, - provider_id: Optional[str] = None, - params: Optional[ScoringFnParams] = None, - ) -> None: ... diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index e16e047e5..fea22a414 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -11,12 +11,9 @@ from pydantic import BaseModel, Field from llama_stack.apis.benchmarks import Benchmark, BenchmarkInput from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Dataset, DatasetInput -from llama_stack.apis.eval import Eval from llama_stack.apis.inference import Inference from llama_stack.apis.models import Model, ModelInput from llama_stack.apis.safety import Safety -from llama_stack.apis.scoring import Scoring -from llama_stack.apis.scoring_functions import ScoringFn, ScoringFnInput from llama_stack.apis.shields import Shield, ShieldInput from llama_stack.apis.tools import Tool, ToolGroup, ToolGroupInput, ToolRuntime from llama_stack.apis.vector_dbs import VectorDB, VectorDBInput @@ -36,7 +33,6 @@ RoutableObject = Union[ Shield, VectorDB, Dataset, - ScoringFn, Benchmark, Tool, ToolGroup, @@ -49,7 +45,6 @@ RoutableObjectWithProvider = Annotated[ Shield, VectorDB, Dataset, - ScoringFn, Benchmark, Tool, ToolGroup, @@ -62,8 +57,6 @@ RoutedProtocol = Union[ Safety, VectorIO, DatasetIO, - Scoring, - Eval, ToolRuntime, ] @@ -191,7 +184,6 @@ a default SQLite store will be used.""", shields: List[ShieldInput] = Field(default_factory=list) vector_dbs: List[VectorDBInput] = Field(default_factory=list) datasets: List[DatasetInput] = Field(default_factory=list) - scoring_fns: List[ScoringFnInput] = Field(default_factory=list) benchmarks: List[BenchmarkInput] = Field(default_factory=list) tool_groups: List[ToolGroupInput] = Field(default_factory=list) diff --git a/llama_stack/distribution/distribution.py b/llama_stack/distribution/distribution.py index ddb727663..43c37806e 100644 --- a/llama_stack/distribution/distribution.py +++ b/llama_stack/distribution/distribution.py @@ -40,23 +40,19 @@ def builtin_automatically_routed_apis() -> List[AutoRoutedApiInfo]: router_api=Api.datasetio, ), AutoRoutedApiInfo( - routing_table_api=Api.scoring_functions, - router_api=Api.scoring, + routing_table_api=Api.tool_groups, + router_api=Api.tool_runtime, ), AutoRoutedApiInfo( routing_table_api=Api.benchmarks, - router_api=Api.eval, - ), - AutoRoutedApiInfo( - routing_table_api=Api.tool_groups, - router_api=Api.tool_runtime, + router_api=Api.evaluation, ), ] def providable_apis() -> List[Api]: routing_table_apis = {x.routing_table_api for x in builtin_automatically_routed_apis()} - return [api for api in Api if api not in routing_table_apis and api != Api.inspect and api != Api.providers] + return [api for api in Api if api not in routing_table_apis and api not in [Api.inspect, Api.providers]] def get_provider_registry() -> Dict[Api, Dict[str, ProviderSpec]]: diff --git a/llama_stack/distribution/resolver.py b/llama_stack/distribution/resolver.py index e9e406699..3a6140478 100644 --- a/llama_stack/distribution/resolver.py +++ b/llama_stack/distribution/resolver.py @@ -11,15 +11,12 @@ from llama_stack.apis.agents import Agents from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO from 
llama_stack.apis.datasets import Datasets -from llama_stack.apis.eval import Eval from llama_stack.apis.inference import Inference from llama_stack.apis.inspect import Inspect from llama_stack.apis.models import Models from llama_stack.apis.post_training import PostTraining from llama_stack.apis.providers import Providers as ProvidersAPI from llama_stack.apis.safety import Safety -from llama_stack.apis.scoring import Scoring -from llama_stack.apis.scoring_functions import ScoringFunctions from llama_stack.apis.shields import Shields from llama_stack.apis.telemetry import Telemetry from llama_stack.apis.tools import ToolGroups, ToolRuntime @@ -38,14 +35,12 @@ from llama_stack.distribution.utils.dynamic import instantiate_class_type from llama_stack.log import get_logger from llama_stack.providers.datatypes import ( Api, - BenchmarksProtocolPrivate, DatasetsProtocolPrivate, InlineProviderSpec, ModelsProtocolPrivate, ProviderSpec, RemoteProviderConfig, RemoteProviderSpec, - ScoringFunctionsProtocolPrivate, ShieldsProtocolPrivate, ToolsProtocolPrivate, VectorDBsProtocolPrivate, @@ -72,9 +67,6 @@ def api_protocol_map() -> Dict[Api, Any]: Api.telemetry: Telemetry, Api.datasetio: DatasetIO, Api.datasets: Datasets, - Api.scoring: Scoring, - Api.scoring_functions: ScoringFunctions, - Api.eval: Eval, Api.benchmarks: Benchmarks, Api.post_training: PostTraining, Api.tool_groups: ToolGroups, @@ -89,12 +81,6 @@ def additional_protocols_map() -> Dict[Api, Any]: Api.vector_io: (VectorDBsProtocolPrivate, VectorDBs, Api.vector_dbs), Api.safety: (ShieldsProtocolPrivate, Shields, Api.shields), Api.datasetio: (DatasetsProtocolPrivate, Datasets, Api.datasets), - Api.scoring: ( - ScoringFunctionsProtocolPrivate, - ScoringFunctions, - Api.scoring_functions, - ), - Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks), } @@ -135,7 +121,9 @@ async def resolve_impls( return await instantiate_providers(sorted_providers, router_apis, dist_registry) -def specs_for_autorouted_apis(apis_to_serve: List[str] | Set[str]) -> Dict[str, Dict[str, ProviderWithSpec]]: +def specs_for_autorouted_apis( + apis_to_serve: List[str] | Set[str], +) -> Dict[str, Dict[str, ProviderWithSpec]]: """Generates specifications for automatically routed APIs.""" specs = {} for info in builtin_automatically_routed_apis(): @@ -177,7 +165,10 @@ def specs_for_autorouted_apis(apis_to_serve: List[str] | Set[str]) -> Dict[str, def validate_and_prepare_providers( - run_config: StackRunConfig, provider_registry: ProviderRegistry, routing_table_apis: Set[Api], router_apis: Set[Api] + run_config: StackRunConfig, + provider_registry: ProviderRegistry, + routing_table_apis: Set[Api], + router_apis: Set[Api], ) -> Dict[str, Dict[str, ProviderWithSpec]]: """Validates providers, handles deprecations, and organizes them into a spec dictionary.""" providers_with_specs: Dict[str, Dict[str, ProviderWithSpec]] = {} @@ -221,7 +212,8 @@ def validate_provider(provider: Provider, api: Api, provider_registry: ProviderR def sort_providers_by_deps( - providers_with_specs: Dict[str, Dict[str, ProviderWithSpec]], run_config: StackRunConfig + providers_with_specs: Dict[str, Dict[str, ProviderWithSpec]], + run_config: StackRunConfig, ) -> List[Tuple[str, ProviderWithSpec]]: """Sorts providers based on their dependencies.""" sorted_providers: List[Tuple[str, ProviderWithSpec]] = topological_sort( @@ -276,7 +268,9 @@ def sort_providers_by_deps( async def instantiate_providers( - sorted_providers: List[Tuple[str, ProviderWithSpec]], router_apis: Set[Api], dist_registry: 
DistributionRegistry + sorted_providers: List[Tuple[str, ProviderWithSpec]], + router_apis: Set[Api], + dist_registry: DistributionRegistry, ) -> Dict: """Instantiates providers asynchronously while managing dependencies.""" impls: Dict[Api, Any] = {} diff --git a/llama_stack/distribution/routers/__init__.py b/llama_stack/distribution/routers/__init__.py index d0fca8771..a9a4f87c8 100644 --- a/llama_stack/distribution/routers/__init__.py +++ b/llama_stack/distribution/routers/__init__.py @@ -14,7 +14,6 @@ from .routing_tables import ( BenchmarksRoutingTable, DatasetsRoutingTable, ModelsRoutingTable, - ScoringFunctionsRoutingTable, ShieldsRoutingTable, ToolGroupsRoutingTable, VectorDBsRoutingTable, @@ -32,7 +31,6 @@ async def get_routing_table_impl( "models": ModelsRoutingTable, "shields": ShieldsRoutingTable, "datasets": DatasetsRoutingTable, - "scoring_functions": ScoringFunctionsRoutingTable, "benchmarks": BenchmarksRoutingTable, "tool_groups": ToolGroupsRoutingTable, } @@ -48,10 +46,8 @@ async def get_routing_table_impl( async def get_auto_router_impl(api: Api, routing_table: RoutingTable, deps: Dict[str, Any]) -> Any: from .routers import ( DatasetIORouter, - EvalRouter, InferenceRouter, SafetyRouter, - ScoringRouter, ToolRuntimeRouter, VectorIORouter, ) @@ -61,8 +57,6 @@ async def get_auto_router_impl(api: Api, routing_table: RoutingTable, deps: Dict "inference": InferenceRouter, "safety": SafetyRouter, "datasetio": DatasetIORouter, - "scoring": ScoringRouter, - "eval": EvalRouter, "tool_runtime": ToolRuntimeRouter, } api_to_deps = { diff --git a/llama_stack/distribution/routers/routers.py b/llama_stack/distribution/routers/routers.py index 2cf38f544..6c77d09e8 100644 --- a/llama_stack/distribution/routers/routers.py +++ b/llama_stack/distribution/routers/routers.py @@ -14,13 +14,6 @@ from llama_stack.apis.common.content_types import ( ) from llama_stack.apis.datasetio import DatasetIO, IterrowsResponse from llama_stack.apis.datasets import DatasetPurpose, DataSource -from llama_stack.apis.eval import ( - BenchmarkConfig, - Eval, - EvaluateResponse, - Job, - JobStatus, -) from llama_stack.apis.inference import ( ChatCompletionResponse, ChatCompletionResponseEventType, @@ -42,12 +35,6 @@ from llama_stack.apis.inference import ( ) from llama_stack.apis.models import Model, ModelType from llama_stack.apis.safety import RunShieldResponse, Safety -from llama_stack.apis.scoring import ( - ScoreBatchResponse, - ScoreResponse, - Scoring, - ScoringFnParams, -) from llama_stack.apis.shields import Shield from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry from llama_stack.apis.tools import ( @@ -521,135 +508,6 @@ class DatasetIORouter(DatasetIO): ) -class ScoringRouter(Scoring): - def __init__( - self, - routing_table: RoutingTable, - ) -> None: - logger.debug("Initializing ScoringRouter") - self.routing_table = routing_table - - async def initialize(self) -> None: - logger.debug("ScoringRouter.initialize") - pass - - async def shutdown(self) -> None: - logger.debug("ScoringRouter.shutdown") - pass - - async def score_batch( - self, - dataset_id: str, - scoring_functions: Dict[str, Optional[ScoringFnParams]] = None, - save_results_dataset: bool = False, - ) -> ScoreBatchResponse: - logger.debug(f"ScoringRouter.score_batch: {dataset_id}") - res = {} - for fn_identifier in scoring_functions.keys(): - score_response = await self.routing_table.get_provider_impl(fn_identifier).score_batch( - dataset_id=dataset_id, - scoring_functions={fn_identifier: 
scoring_functions[fn_identifier]}, - ) - res.update(score_response.results) - - if save_results_dataset: - raise NotImplementedError("Save results dataset not implemented yet") - - return ScoreBatchResponse( - results=res, - ) - - async def score( - self, - input_rows: List[Dict[str, Any]], - scoring_functions: Dict[str, Optional[ScoringFnParams]] = None, - ) -> ScoreResponse: - logger.debug(f"ScoringRouter.score: {len(input_rows)} rows, {len(scoring_functions)} functions") - res = {} - # look up and map each scoring function to its provider impl - for fn_identifier in scoring_functions.keys(): - score_response = await self.routing_table.get_provider_impl(fn_identifier).score( - input_rows=input_rows, - scoring_functions={fn_identifier: scoring_functions[fn_identifier]}, - ) - res.update(score_response.results) - - return ScoreResponse(results=res) - - -class EvalRouter(Eval): - def __init__( - self, - routing_table: RoutingTable, - ) -> None: - logger.debug("Initializing EvalRouter") - self.routing_table = routing_table - - async def initialize(self) -> None: - logger.debug("EvalRouter.initialize") - pass - - async def shutdown(self) -> None: - logger.debug("EvalRouter.shutdown") - pass - - async def run_eval( - self, - benchmark_id: str, - benchmark_config: BenchmarkConfig, - ) -> Job: - logger.debug(f"EvalRouter.run_eval: {benchmark_id}") - return await self.routing_table.get_provider_impl(benchmark_id).run_eval( - benchmark_id=benchmark_id, - benchmark_config=benchmark_config, - ) - - async def evaluate_rows( - self, - benchmark_id: str, - input_rows: List[Dict[str, Any]], - scoring_functions: List[str], - benchmark_config: BenchmarkConfig, - ) -> EvaluateResponse: - logger.debug(f"EvalRouter.evaluate_rows: {benchmark_id}, {len(input_rows)} rows") - return await self.routing_table.get_provider_impl(benchmark_id).evaluate_rows( - benchmark_id=benchmark_id, - input_rows=input_rows, - scoring_functions=scoring_functions, - benchmark_config=benchmark_config, - ) - - async def job_status( - self, - benchmark_id: str, - job_id: str, - ) -> Optional[JobStatus]: - logger.debug(f"EvalRouter.job_status: {benchmark_id}, {job_id}") - return await self.routing_table.get_provider_impl(benchmark_id).job_status(benchmark_id, job_id) - - async def job_cancel( - self, - benchmark_id: str, - job_id: str, - ) -> None: - logger.debug(f"EvalRouter.job_cancel: {benchmark_id}, {job_id}") - await self.routing_table.get_provider_impl(benchmark_id).job_cancel( - benchmark_id, - job_id, - ) - - async def job_result( - self, - benchmark_id: str, - job_id: str, - ) -> EvaluateResponse: - logger.debug(f"EvalRouter.job_result: {benchmark_id}, {job_id}") - return await self.routing_table.get_provider_impl(benchmark_id).job_result( - benchmark_id, - job_id, - ) - - class ToolRuntimeRouter(ToolRuntime): class RagToolImpl(RAGToolRuntime): def __init__( diff --git a/llama_stack/distribution/routers/routing_tables.py b/llama_stack/distribution/routers/routing_tables.py index 3e44d2926..55c4ed85f 100644 --- a/llama_stack/distribution/routers/routing_tables.py +++ b/llama_stack/distribution/routers/routing_tables.py @@ -12,7 +12,6 @@ from pydantic import TypeAdapter from llama_stack.apis.benchmarks import Benchmark, Benchmarks, ListBenchmarksResponse from llama_stack.apis.common.content_types import URL -from llama_stack.apis.common.type_system import ParamType from llama_stack.apis.datasets import ( Dataset, DatasetPurpose, @@ -23,12 +22,6 @@ from llama_stack.apis.datasets import ( ) from llama_stack.apis.models import 
ListModelsResponse, Model, Models, ModelType from llama_stack.apis.resource import ResourceType -from llama_stack.apis.scoring_functions import ( - ListScoringFunctionsResponse, - ScoringFn, - ScoringFnParams, - ScoringFunctions, -) from llama_stack.apis.shields import ListShieldsResponse, Shield, Shields from llama_stack.apis.tools import ( ListToolGroupsResponse, @@ -68,10 +61,6 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable return await p.register_vector_db(obj) elif api == Api.datasetio: return await p.register_dataset(obj) - elif api == Api.scoring: - return await p.register_scoring_function(obj) - elif api == Api.eval: - return await p.register_benchmark(obj) elif api == Api.tool_runtime: return await p.register_tool(obj) else: @@ -117,7 +106,7 @@ class CommonRoutingTableImpl(RoutingTable): await self.dist_registry.register(obj) # Register all objects from providers - for pid, p in self.impls_by_provider_id.items(): + for _pid, p in self.impls_by_provider_id.items(): api = get_impl_api(p) if api == Api.inference: p.model_store = self @@ -127,12 +116,6 @@ class CommonRoutingTableImpl(RoutingTable): p.vector_db_store = self elif api == Api.datasetio: p.dataset_store = self - elif api == Api.scoring: - p.scoring_function_store = self - scoring_functions = await p.list_scoring_functions() - await add_objects(scoring_functions, pid, ScoringFn) - elif api == Api.eval: - p.benchmark_store = self elif api == Api.tool_runtime: p.tool_store = self @@ -150,8 +133,6 @@ class CommonRoutingTableImpl(RoutingTable): return ("VectorIO", "vector_db") elif isinstance(self, DatasetsRoutingTable): return ("DatasetIO", "dataset") - elif isinstance(self, ScoringFunctionsRoutingTable): - return ("Scoring", "scoring_function") elif isinstance(self, BenchmarksRoutingTable): return ("Eval", "benchmark") elif isinstance(self, ToolGroupsRoutingTable): @@ -416,46 +397,6 @@ class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets): await self.unregister_object(dataset) -class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions): - async def list_scoring_functions(self) -> ListScoringFunctionsResponse: - return ListScoringFunctionsResponse(data=await self.get_all_with_type(ResourceType.scoring_function.value)) - - async def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn: - scoring_fn = await self.get_object_by_identifier("scoring_function", scoring_fn_id) - if scoring_fn is None: - raise ValueError(f"Scoring function '{scoring_fn_id}' not found") - return scoring_fn - - async def register_scoring_function( - self, - scoring_fn_id: str, - description: str, - return_type: ParamType, - provider_scoring_fn_id: Optional[str] = None, - provider_id: Optional[str] = None, - params: Optional[ScoringFnParams] = None, - ) -> None: - if provider_scoring_fn_id is None: - provider_scoring_fn_id = scoring_fn_id - if provider_id is None: - if len(self.impls_by_provider_id) == 1: - provider_id = list(self.impls_by_provider_id.keys())[0] - else: - raise ValueError( - "No provider specified and multiple providers available. Please specify a provider_id." 
- ) - scoring_fn = ScoringFn( - identifier=scoring_fn_id, - description=description, - return_type=return_type, - provider_resource_id=provider_scoring_fn_id, - provider_id=provider_id, - params=params, - ) - scoring_fn.provider_id = provider_id - await self.register_object(scoring_fn) - - class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks): async def list_benchmarks(self) -> ListBenchmarksResponse: return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark")) diff --git a/llama_stack/distribution/stack.py b/llama_stack/distribution/stack.py index ccd75f6f6..90f55fc87 100644 --- a/llama_stack/distribution/stack.py +++ b/llama_stack/distribution/stack.py @@ -17,7 +17,6 @@ from llama_stack.apis.batch_inference import BatchInference from llama_stack.apis.benchmarks import Benchmarks from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets -from llama_stack.apis.eval import Eval from llama_stack.apis.evaluation import Evaluation from llama_stack.apis.files import Files from llama_stack.apis.graders import Graders @@ -27,8 +26,6 @@ from llama_stack.apis.models import Models from llama_stack.apis.post_training import PostTraining from llama_stack.apis.providers import Providers from llama_stack.apis.safety import Safety -from llama_stack.apis.scoring import Scoring -from llama_stack.apis.scoring_functions import ScoringFunctions from llama_stack.apis.shields import Shields from llama_stack.apis.synthetic_data_generation import SyntheticDataGeneration from llama_stack.apis.telemetry import Telemetry @@ -69,9 +66,6 @@ class LlamaStack( Files, Graders, Evaluation, - Eval, - ScoringFunctions, - Scoring, ): pass @@ -81,12 +75,6 @@ RESOURCES = [ ("shields", Api.shields, "register_shield", "list_shields"), ("vector_dbs", Api.vector_dbs, "register_vector_db", "list_vector_dbs"), ("datasets", Api.datasets, "register_dataset", "list_datasets"), - ( - "scoring_fns", - Api.scoring_functions, - "register_scoring_function", - "list_scoring_functions", - ), ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks"), ("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups"), ] diff --git a/llama_stack/distribution/ui/modules/api.py b/llama_stack/distribution/ui/modules/api.py index 40caccda0..1746a8a4f 100644 --- a/llama_stack/distribution/ui/modules/api.py +++ b/llama_stack/distribution/ui/modules/api.py @@ -26,7 +26,10 @@ class LlamaStackApi: """Run scoring on a single row""" if not scoring_params: scoring_params = {fn_id: None for fn_id in scoring_function_ids} - return self.client.scoring.score(input_rows=[row], scoring_functions=scoring_params) + + # TODO(xiyan): fix this + # return self.client.scoring.score(input_rows=[row], scoring_functions=scoring_params) + raise NotImplementedError("Scoring is not implemented") llama_stack_api = LlamaStackApi() diff --git a/llama_stack/distribution/ui/page/distribution/resources.py b/llama_stack/distribution/ui/page/distribution/resources.py index 5e10e6e80..28f35fbd0 100644 --- a/llama_stack/distribution/ui/page/distribution/resources.py +++ b/llama_stack/distribution/ui/page/distribution/resources.py @@ -9,7 +9,6 @@ from streamlit_option_menu import option_menu from llama_stack.distribution.ui.page.distribution.datasets import datasets from llama_stack.distribution.ui.page.distribution.eval_tasks import benchmarks from llama_stack.distribution.ui.page.distribution.models import models -from llama_stack.distribution.ui.page.distribution.scoring_functions import 
scoring_functions from llama_stack.distribution.ui.page.distribution.shields import shields from llama_stack.distribution.ui.page.distribution.vector_dbs import vector_dbs @@ -43,8 +42,9 @@ def resources_page(): datasets() elif selected_resource == "Models": models() - elif selected_resource == "Scoring Functions": - scoring_functions() + # TODO(xiyan): fix this + # elif selected_resource == "Scoring Functions": + # scoring_functions() elif selected_resource == "Shields": shields() diff --git a/llama_stack/providers/datatypes.py b/llama_stack/providers/datatypes.py index 384582423..76873d188 100644 --- a/llama_stack/providers/datatypes.py +++ b/llama_stack/providers/datatypes.py @@ -13,7 +13,6 @@ from llama_stack.apis.benchmarks import Benchmark from llama_stack.apis.datasets import Dataset from llama_stack.apis.datatypes import Api from llama_stack.apis.models import Model -from llama_stack.apis.scoring_functions import ScoringFn from llama_stack.apis.shields import Shield from llama_stack.apis.tools import Tool from llama_stack.apis.vector_dbs import VectorDB @@ -42,12 +41,6 @@ class DatasetsProtocolPrivate(Protocol): async def unregister_dataset(self, dataset_id: str) -> None: ... -class ScoringFunctionsProtocolPrivate(Protocol): - async def list_scoring_functions(self) -> List[ScoringFn]: ... - - async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ... - - class BenchmarksProtocolPrivate(Protocol): async def register_benchmark(self, benchmark: Benchmark) -> None: ... diff --git a/llama_stack/providers/inline/eval/meta_reference/__init__.py b/llama_stack/providers/inline/eval/meta_reference/__init__.py index e2a7fc2cd..576a5682b 100644 --- a/llama_stack/providers/inline/eval/meta_reference/__init__.py +++ b/llama_stack/providers/inline/eval/meta_reference/__init__.py @@ -20,7 +20,6 @@ async def get_provider_impl( config, deps[Api.datasetio], deps[Api.datasets], - deps[Api.scoring], deps[Api.inference], deps[Api.agents], ) diff --git a/llama_stack/providers/inline/eval/meta_reference/eval.py b/llama_stack/providers/inline/eval/meta_reference/eval.py index 3630d4c03..64a4c0946 100644 --- a/llama_stack/providers/inline/eval/meta_reference/eval.py +++ b/llama_stack/providers/inline/eval/meta_reference/eval.py @@ -13,7 +13,6 @@ from llama_stack.apis.benchmarks import Benchmark from llama_stack.apis.datasetio import DatasetIO from llama_stack.apis.datasets import Datasets from llama_stack.apis.inference import Inference, SystemMessage, UserMessage -from llama_stack.apis.scoring import Scoring from llama_stack.providers.datatypes import BenchmarksProtocolPrivate from llama_stack.providers.inline.agents.meta_reference.agent_instance import ( MEMORY_QUERY_TOOL, @@ -37,14 +36,14 @@ class MetaReferenceEvalImpl( config: MetaReferenceEvalConfig, datasetio_api: DatasetIO, datasets_api: Datasets, - scoring_api: Scoring, inference_api: Inference, agents_api: Agents, ) -> None: self.config = config self.datasetio_api = datasetio_api self.datasets_api = datasets_api - self.scoring_api = scoring_api + # TODO(xiyan): this implementation will be refactored + self.scoring_api = None self.inference_api = inference_api self.agents_api = agents_api diff --git a/llama_stack/providers/inline/evaluation/meta_reference/__init__.py b/llama_stack/providers/inline/evaluation/meta_reference/__init__.py new file mode 100644 index 000000000..bf5f5a6fa --- /dev/null +++ b/llama_stack/providers/inline/evaluation/meta_reference/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from typing import Any, Dict + +from llama_stack.distribution.datatypes import Api + +from .config import MetaReferenceEvaluationConfig + + +async def get_provider_impl( + config: MetaReferenceEvaluationConfig, + deps: Dict[Api, Any], +): + from .evaluation import MetaReferenceEvaluationImpl + + impl = MetaReferenceEvaluationImpl( + config, + deps[Api.datasetio], + deps[Api.datasets], + deps[Api.inference], + deps[Api.agents], + ) + await impl.initialize() + return impl diff --git a/llama_stack/providers/inline/evaluation/meta_reference/config.py b/llama_stack/providers/inline/evaluation/meta_reference/config.py new file mode 100644 index 000000000..653e3b5c7 --- /dev/null +++ b/llama_stack/providers/inline/evaluation/meta_reference/config.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. +from typing import Any, Dict + +from pydantic import BaseModel + +from llama_stack.providers.utils.kvstore.config import ( + KVStoreConfig, + SqliteKVStoreConfig, +) + + +class MetaReferenceEvaluationConfig(BaseModel): + kvstore: KVStoreConfig + + @classmethod + def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> Dict[str, Any]: + return { + "kvstore": SqliteKVStoreConfig.sample_run_config( + __distro_dir__=__distro_dir__, + db_name="meta_reference_evaluation.db", + ) + } diff --git a/llama_stack/providers/inline/evaluation/meta_reference/evaluation.py b/llama_stack/providers/inline/evaluation/meta_reference/evaluation.py new file mode 100644 index 000000000..f1be056a9 --- /dev/null +++ b/llama_stack/providers/inline/evaluation/meta_reference/evaluation.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
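A minimal sketch of constructing the MetaReferenceEvaluationConfig defined in config.py above, assuming SqliteKVStoreConfig accepts a db_path; the path itself is illustrative:

    from llama_stack.providers.inline.evaluation.meta_reference.config import (
        MetaReferenceEvaluationConfig,
    )
    from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig

    # Back the evaluation job state with a local SQLite kvstore.
    config = MetaReferenceEvaluationConfig(
        kvstore=SqliteKVStoreConfig(db_path="/tmp/meta_reference_evaluation.db"),
    )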
+
+from llama_stack.apis.agents import Agents
+from llama_stack.apis.benchmarks import Benchmark
+from llama_stack.apis.datasetio import DatasetIO
+from llama_stack.apis.datasets import Datasets
+from llama_stack.apis.evaluation.evaluation import (
+    Evaluation,
+    EvaluationCandidate,
+    EvaluationJob,
+    EvaluationResponse,
+    EvaluationTask,
+)
+from llama_stack.apis.inference import Inference
+from llama_stack.providers.datatypes import BenchmarksProtocolPrivate
+
+from .config import MetaReferenceEvaluationConfig
+
+EVAL_TASKS_PREFIX = "benchmarks:"
+
+
+class MetaReferenceEvaluationImpl(
+    Evaluation,
+    BenchmarksProtocolPrivate,
+):
+    def __init__(
+        self,
+        config: MetaReferenceEvaluationConfig,
+        datasetio_api: DatasetIO,
+        datasets_api: Datasets,
+        inference_api: Inference,
+        agents_api: Agents,
+    ) -> None:
+        self.config = config
+        self.datasetio_api = datasetio_api
+        self.datasets_api = datasets_api
+        self.inference_api = inference_api
+        self.agents_api = agents_api
+
+    async def initialize(self) -> None:
+        pass
+
+    async def shutdown(self) -> None:
+        pass
+
+    async def register_benchmark(self, benchmark: Benchmark) -> None:
+        pass
+
+    async def run(
+        self,
+        task: EvaluationTask,
+        candidate: EvaluationCandidate,
+    ) -> EvaluationJob:
+        raise NotImplementedError("Run is not implemented yet")
+
+    async def run_sync(
+        self,
+        task: EvaluationTask,
+        candidate: EvaluationCandidate,
+    ) -> EvaluationResponse:
+        raise NotImplementedError("Run sync is not implemented yet")
+
+    async def grade(self, task: EvaluationTask) -> EvaluationJob:
+        raise NotImplementedError("Grade is not implemented yet")
+
+    async def grade_sync(self, task: EvaluationTask) -> EvaluationResponse:
+        raise NotImplementedError("Grade sync is not implemented yet")
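A minimal sketch of how this skeleton is expected to behave once the stack resolver wires it up. The deps mapping mirrors get_provider_impl in the new __init__.py above; the None placeholders and the sqlite path are illustrative assumptions, not part of this patch:

import asyncio

from llama_stack.distribution.datatypes import Api
from llama_stack.providers.inline.evaluation.meta_reference import (
    MetaReferenceEvaluationConfig,
    get_provider_impl,
)
from llama_stack.providers.utils.kvstore.config import SqliteKVStoreConfig


async def main() -> None:
    # Illustrative config; the db_path is a made-up location.
    config = MetaReferenceEvaluationConfig(
        kvstore=SqliteKVStoreConfig(db_path="/tmp/meta_reference_evaluation.db"),
    )
    # The resolver normally injects live impls for these APIs; None placeholders
    # suffice here because the constructor only stores the references.
    deps = {
        Api.datasetio: None,
        Api.datasets: None,
        Api.inference: None,
        Api.agents: None,
    }
    impl = await get_provider_impl(config, deps)
    try:
        await impl.grade_sync(task=None)  # placeholder; real type is EvaluationTask
    except NotImplementedError as err:
        print(err)  # -> "Grade sync is not implemented yet"


asyncio.run(main())

run/grade are the job-style entry points returning an EvaluationJob, while the *_sync variants block for an EvaluationResponse; all four raise until the real implementation lands.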
diff --git a/llama_stack/providers/registry/eval.py b/llama_stack/providers/registry/eval.py
deleted file mode 100644
index 755d30382..000000000
--- a/llama_stack/providers/registry/eval.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import List
-
-from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
-
-
-def available_providers() -> List[ProviderSpec]:
-    return [
-        InlineProviderSpec(
-            api=Api.eval,
-            provider_type="inline::meta-reference",
-            pip_packages=["tree_sitter"],
-            module="llama_stack.providers.inline.eval.meta_reference",
-            config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig",
-            api_dependencies=[
-                Api.datasetio,
-                Api.datasets,
-                Api.scoring,
-                Api.inference,
-                Api.agents,
-            ],
-        ),
-    ]
diff --git a/llama_stack/providers/registry/evaluation.py b/llama_stack/providers/registry/evaluation.py
new file mode 100644
index 000000000..2481cea5a
--- /dev/null
+++ b/llama_stack/providers/registry/evaluation.py
@@ -0,0 +1,34 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import List
+
+from llama_stack.providers.datatypes import Api, InlineProviderSpec, ProviderSpec
+from llama_stack.providers.utils.kvstore import kvstore_dependencies
+
+
+def available_providers() -> List[ProviderSpec]:
+    return [
+        InlineProviderSpec(
+            api=Api.evaluation,
+            provider_type="inline::meta-reference",
+            pip_packages=[
+                "matplotlib",
+                "pillow",
+                "pandas",
+                "scikit-learn",
+            ]
+            + kvstore_dependencies(),
+            module="llama_stack.providers.inline.evaluation.meta_reference",
+            config_class="llama_stack.providers.inline.evaluation.meta_reference.MetaReferenceEvaluationConfig",
+            api_dependencies=[
+                Api.datasetio,
+                Api.datasets,
+                Api.inference,
+                Api.agents,
+            ],
+        ),
+    ]
diff --git a/llama_stack/providers/utils/common/data_schema_validator.py b/llama_stack/providers/utils/common/data_schema_validator.py
index eb9d9dd60..95663a4e9 100644
--- a/llama_stack/providers/utils/common/data_schema_validator.py
+++ b/llama_stack/providers/utils/common/data_schema_validator.py
@@ -5,14 +5,12 @@
 # the root directory of this source tree.
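For reference, the kvstore block a run.yaml would carry for this provider is produced by MetaReferenceEvaluationConfig.sample_run_config (see config.py above); a short sketch, with a hypothetical distro dir:

from llama_stack.providers.inline.evaluation.meta_reference.config import (
    MetaReferenceEvaluationConfig,
)

# __distro_dir__ is a hypothetical path; sample_run_config threads it into the
# sqlite db location via SqliteKVStoreConfig.sample_run_config.
sample = MetaReferenceEvaluationConfig.sample_run_config(
    __distro_dir__="~/.llama/distributions/example"
)
print(sample["kvstore"])  # sqlite kvstore entry pointing at meta_reference_evaluation.db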
from enum import Enum -from typing import Any, Dict, List from llama_stack.apis.common.type_system import ( ChatCompletionInputType, CompletionInputType, StringType, ) -from llama_stack.distribution.datatypes import Api class ColumnName(Enum): @@ -75,29 +73,31 @@ VALID_SCHEMAS_FOR_EVAL = [ ] -def get_valid_schemas(api_str: str): - if api_str == Api.scoring.value: - return VALID_SCHEMAS_FOR_SCORING - elif api_str == Api.eval.value: - return VALID_SCHEMAS_FOR_EVAL - else: - raise ValueError(f"Invalid API string: {api_str}") +# TODO(xiyan): add this back + +# def get_valid_schemas(api_str: str): +# if api_str == Api.scoring.value: +# return VALID_SCHEMAS_FOR_SCORING +# elif api_str == Api.eval.value: +# return VALID_SCHEMAS_FOR_EVAL +# else: +# raise ValueError(f"Invalid API string: {api_str}") -def validate_dataset_schema( - dataset_schema: Dict[str, Any], - expected_schemas: List[Dict[str, Any]], -): - if dataset_schema not in expected_schemas: - raise ValueError(f"Dataset {dataset_schema} does not have a correct input schema in {expected_schemas}") +# def validate_dataset_schema( +# dataset_schema: Dict[str, Any], +# expected_schemas: List[Dict[str, Any]], +# ): +# if dataset_schema not in expected_schemas: +# raise ValueError(f"Dataset {dataset_schema} does not have a correct input schema in {expected_schemas}") -def validate_row_schema( - input_row: Dict[str, Any], - expected_schemas: List[Dict[str, Any]], -): - for schema in expected_schemas: - if all(key in input_row for key in schema): - return +# def validate_row_schema( +# input_row: Dict[str, Any], +# expected_schemas: List[Dict[str, Any]], +# ): +# for schema in expected_schemas: +# if all(key in input_row for key in schema): +# return - raise ValueError(f"Input row {input_row} does not match any of the expected schemas in {expected_schemas}") +# raise ValueError(f"Input row {input_row} does not match any of the expected schemas in {expected_schemas}") diff --git a/llama_stack/templates/bedrock/bedrock.py b/llama_stack/templates/bedrock/bedrock.py index 9171ae18a..5a30e7189 100644 --- a/llama_stack/templates/bedrock/bedrock.py +++ b/llama_stack/templates/bedrock/bedrock.py @@ -9,7 +9,11 @@ from pathlib import Path from llama_stack.distribution.datatypes import Provider, ToolGroupInput from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig from llama_stack.providers.remote.inference.bedrock.models import MODEL_ENTRIES -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry +from llama_stack.templates.template import ( + DistributionTemplate, + RunConfigSettings, + get_model_registry, +) def get_distribution_template() -> DistributionTemplate: @@ -19,9 +23,7 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["remote::bedrock"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], "tool_runtime": [ "remote::brave-search", "remote::tavily-search", diff --git a/llama_stack/templates/bedrock/build.yaml b/llama_stack/templates/bedrock/build.yaml index 6c07b0478..209cd8e34 100644 --- a/llama_stack/templates/bedrock/build.yaml +++ b/llama_stack/templates/bedrock/build.yaml @@ -14,15 +14,9 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - remote::huggingface - 
inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index 39ed8cf48..ae6357345 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -3,10 +3,8 @@ image_name: bedrock apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -42,14 +40,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/bedrock/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -65,17 +55,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/bedrock}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -133,7 +112,6 @@ models: shields: [] vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/cerebras/build.yaml b/llama_stack/templates/cerebras/build.yaml index ef6c43212..5fe4a6bf0 100644 --- a/llama_stack/templates/cerebras/build.yaml +++ b/llama_stack/templates/cerebras/build.yaml @@ -13,15 +13,9 @@ distribution_spec: - remote::pgvector agents: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - remote::huggingface - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust telemetry: - inline::meta-reference tool_runtime: diff --git a/llama_stack/templates/cerebras/cerebras.py b/llama_stack/templates/cerebras/cerebras.py index 4a9ad90b4..beacfc521 100644 --- a/llama_stack/templates/cerebras/cerebras.py +++ b/llama_stack/templates/cerebras/cerebras.py @@ -14,7 +14,11 @@ from llama_stack.providers.inline.inference.sentence_transformers import ( from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig from llama_stack.providers.remote.inference.cerebras import CerebrasImplConfig from llama_stack.providers.remote.inference.cerebras.models import MODEL_ENTRIES -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry +from llama_stack.templates.template import ( + DistributionTemplate, + RunConfigSettings, + get_model_registry, +) def get_distribution_template() -> DistributionTemplate: @@ -23,9 +27,7 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], "agents": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], "telemetry": ["inline::meta-reference"], "tool_runtime": [ "remote::brave-search", diff --git 
a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index 8315f75d5..fb3d7ec9b 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -3,10 +3,8 @@ image_name: cerebras apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -41,14 +39,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/agents_store.db - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -64,17 +54,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/cerebras}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} telemetry: - provider_id: meta-reference provider_type: inline::meta-reference @@ -131,7 +110,6 @@ models: shields: [] vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/ci-tests/build.yaml b/llama_stack/templates/ci-tests/build.yaml index a5c615f2f..3c6ff6924 100644 --- a/llama_stack/templates/ci-tests/build.yaml +++ b/llama_stack/templates/ci-tests/build.yaml @@ -15,15 +15,9 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - remote::huggingface - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/ci-tests/ci_tests.py b/llama_stack/templates/ci-tests/ci_tests.py index b204af5ea..efb9647f7 100644 --- a/llama_stack/templates/ci-tests/ci_tests.py +++ b/llama_stack/templates/ci-tests/ci_tests.py @@ -15,10 +15,16 @@ from llama_stack.distribution.datatypes import ( from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) -from llama_stack.providers.inline.vector_io.sqlite_vec.config import SQLiteVectorIOConfig +from llama_stack.providers.inline.vector_io.sqlite_vec.config import ( + SQLiteVectorIOConfig, +) from llama_stack.providers.remote.inference.fireworks.config import FireworksImplConfig from llama_stack.providers.remote.inference.fireworks.models import MODEL_ENTRIES -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry +from llama_stack.templates.template import ( + DistributionTemplate, + RunConfigSettings, + get_model_registry, +) def get_distribution_template() -> DistributionTemplate: @@ -28,9 +34,7 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], "tool_runtime": [ "remote::brave-search", "remote::tavily-search", diff --git a/llama_stack/templates/ci-tests/run.yaml 
b/llama_stack/templates/ci-tests/run.yaml index ae2b3912c..6b351ac9c 100644 --- a/llama_stack/templates/ci-tests/run.yaml +++ b/llama_stack/templates/ci-tests/run.yaml @@ -3,10 +3,8 @@ image_name: ci-tests apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -45,14 +43,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ci-tests/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -68,17 +58,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ci-tests}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -209,7 +188,6 @@ shields: - shield_id: meta-llama/Llama-Guard-3-8B vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/dell/build.yaml b/llama_stack/templates/dell/build.yaml index 05b98d56f..12183da9e 100644 --- a/llama_stack/templates/dell/build.yaml +++ b/llama_stack/templates/dell/build.yaml @@ -16,15 +16,9 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - remote::huggingface - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/dell/dell.py b/llama_stack/templates/dell/dell.py index 52c5a5476..161a611ae 100644 --- a/llama_stack/templates/dell/dell.py +++ b/llama_stack/templates/dell/dell.py @@ -24,9 +24,7 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], "tool_runtime": [ "remote::brave-search", "remote::tavily-search", diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml index 8a62a5a42..ff074659b 100644 --- a/llama_stack/templates/dell/run-with-safety.yaml +++ b/llama_stack/templates/dell/run-with-safety.yaml @@ -3,10 +3,8 @@ image_name: dell apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -48,14 +46,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dell/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface 
@@ -71,17 +61,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -120,7 +99,6 @@ shields: - shield_id: ${env.SAFETY_MODEL} vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml index 31c63bd83..762769349 100644 --- a/llama_stack/templates/dell/run.yaml +++ b/llama_stack/templates/dell/run.yaml @@ -3,10 +3,8 @@ image_name: dell apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -44,14 +42,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dell/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -67,17 +57,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dell}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -111,7 +90,6 @@ models: shields: [] vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/dev/build.yaml b/llama_stack/templates/dev/build.yaml index 726ebccca..c98972dac 100644 --- a/llama_stack/templates/dev/build.yaml +++ b/llama_stack/templates/dev/build.yaml @@ -19,15 +19,9 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - remote::huggingface - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/dev/dev.py b/llama_stack/templates/dev/dev.py index 1aee1bb22..36ab22188 100644 --- a/llama_stack/templates/dev/dev.py +++ b/llama_stack/templates/dev/dev.py @@ -16,20 +16,38 @@ from llama_stack.distribution.datatypes import ( from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) -from llama_stack.providers.inline.vector_io.sqlite_vec.config import SQLiteVectorIOConfig +from llama_stack.providers.inline.vector_io.sqlite_vec.config import ( + SQLiteVectorIOConfig, +) from llama_stack.providers.remote.inference.anthropic.config import AnthropicConfig -from llama_stack.providers.remote.inference.anthropic.models import MODEL_ENTRIES as ANTHROPIC_MODEL_ENTRIES +from llama_stack.providers.remote.inference.anthropic.models import ( + MODEL_ENTRIES as ANTHROPIC_MODEL_ENTRIES, 
+) from llama_stack.providers.remote.inference.fireworks.config import FireworksImplConfig -from llama_stack.providers.remote.inference.fireworks.models import MODEL_ENTRIES as FIREWORKS_MODEL_ENTRIES +from llama_stack.providers.remote.inference.fireworks.models import ( + MODEL_ENTRIES as FIREWORKS_MODEL_ENTRIES, +) from llama_stack.providers.remote.inference.gemini.config import GeminiConfig -from llama_stack.providers.remote.inference.gemini.models import MODEL_ENTRIES as GEMINI_MODEL_ENTRIES +from llama_stack.providers.remote.inference.gemini.models import ( + MODEL_ENTRIES as GEMINI_MODEL_ENTRIES, +) from llama_stack.providers.remote.inference.groq.config import GroqConfig -from llama_stack.providers.remote.inference.groq.models import MODEL_ENTRIES as GROQ_MODEL_ENTRIES +from llama_stack.providers.remote.inference.groq.models import ( + MODEL_ENTRIES as GROQ_MODEL_ENTRIES, +) from llama_stack.providers.remote.inference.openai.config import OpenAIConfig -from llama_stack.providers.remote.inference.openai.models import MODEL_ENTRIES as OPENAI_MODEL_ENTRIES +from llama_stack.providers.remote.inference.openai.models import ( + MODEL_ENTRIES as OPENAI_MODEL_ENTRIES, +) from llama_stack.providers.remote.vector_io.chroma.config import ChromaVectorIOConfig -from llama_stack.providers.remote.vector_io.pgvector.config import PGVectorVectorIOConfig -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry +from llama_stack.providers.remote.vector_io.pgvector.config import ( + PGVectorVectorIOConfig, +) +from llama_stack.templates.template import ( + DistributionTemplate, + RunConfigSettings, + get_model_registry, +) def get_inference_providers() -> Tuple[List[Provider], List[ModelInput]]: @@ -83,9 +101,7 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], "tool_runtime": [ "remote::brave-search", "remote::tavily-search", diff --git a/llama_stack/templates/dev/run.yaml b/llama_stack/templates/dev/run.yaml index dba13b357..dbffbf215 100644 --- a/llama_stack/templates/dev/run.yaml +++ b/llama_stack/templates/dev/run.yaml @@ -3,10 +3,8 @@ image_name: dev apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -74,14 +72,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/dev/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -97,17 +87,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/dev}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -365,7 +344,6 @@ shields: - 
shield_id: meta-llama/Llama-Guard-3-8B vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/fireworks/build.yaml b/llama_stack/templates/fireworks/build.yaml index 3907eba78..c5904a7e3 100644 --- a/llama_stack/templates/fireworks/build.yaml +++ b/llama_stack/templates/fireworks/build.yaml @@ -15,15 +15,9 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - remote::huggingface - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/fireworks/fireworks.py b/llama_stack/templates/fireworks/fireworks.py index 3e6d1ca89..9b33ebc7b 100644 --- a/llama_stack/templates/fireworks/fireworks.py +++ b/llama_stack/templates/fireworks/fireworks.py @@ -19,7 +19,11 @@ from llama_stack.providers.inline.inference.sentence_transformers import ( from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig from llama_stack.providers.remote.inference.fireworks.config import FireworksImplConfig from llama_stack.providers.remote.inference.fireworks.models import MODEL_ENTRIES -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry +from llama_stack.templates.template import ( + DistributionTemplate, + RunConfigSettings, + get_model_registry, +) def get_distribution_template() -> DistributionTemplate: @@ -29,9 +33,7 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], "tool_runtime": [ "remote::brave-search", "remote::tavily-search", diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index 2d79a3548..b89323dbc 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -3,10 +3,8 @@ image_name: fireworks apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -53,14 +51,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/fireworks/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -76,17 +66,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -226,7 +205,6 @@ shields: provider_id: code-scanner vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - 
toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index 285495ad9..38f3bb67f 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -3,10 +3,8 @@ image_name: fireworks apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -48,14 +46,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/fireworks/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -71,17 +61,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/fireworks}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -216,7 +195,6 @@ shields: - shield_id: meta-llama/Llama-Guard-3-8B vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/groq/build.yaml b/llama_stack/templates/groq/build.yaml index 3263ce83b..6a92d0b01 100644 --- a/llama_stack/templates/groq/build.yaml +++ b/llama_stack/templates/groq/build.yaml @@ -12,15 +12,9 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - remote::huggingface - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/groq/groq.py b/llama_stack/templates/groq/groq.py index 71c504cde..b51cceb0e 100644 --- a/llama_stack/templates/groq/groq.py +++ b/llama_stack/templates/groq/groq.py @@ -7,17 +7,17 @@ from pathlib import Path from llama_stack.apis.models.models import ModelType -from llama_stack.distribution.datatypes import ( - ModelInput, - Provider, - ToolGroupInput, -) +from llama_stack.distribution.datatypes import ModelInput, Provider, ToolGroupInput from llama_stack.providers.inline.inference.sentence_transformers import ( SentenceTransformersInferenceConfig, ) from llama_stack.providers.remote.inference.groq import GroqConfig from llama_stack.providers.remote.inference.groq.models import MODEL_ENTRIES -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry +from llama_stack.templates.template import ( + DistributionTemplate, + RunConfigSettings, + get_model_registry, +) def get_distribution_template() -> DistributionTemplate: @@ -27,9 +27,7 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], "tool_runtime": [ "remote::brave-search", 
"remote::tavily-search", diff --git a/llama_stack/templates/groq/run.yaml b/llama_stack/templates/groq/run.yaml index 6afea2355..441e2bda0 100644 --- a/llama_stack/templates/groq/run.yaml +++ b/llama_stack/templates/groq/run.yaml @@ -3,10 +3,8 @@ image_name: groq apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -48,14 +46,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/groq/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -71,17 +61,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/groq}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -156,7 +135,6 @@ models: shields: [] vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/hf-endpoint/build.yaml b/llama_stack/templates/hf-endpoint/build.yaml index c2eaaa05b..0b6c072aa 100644 --- a/llama_stack/templates/hf-endpoint/build.yaml +++ b/llama_stack/templates/hf-endpoint/build.yaml @@ -14,15 +14,9 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - remote::huggingface - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/hf-endpoint/hf_endpoint.py b/llama_stack/templates/hf-endpoint/hf_endpoint.py index 0dafe0a01..8fd31cb1d 100644 --- a/llama_stack/templates/hf-endpoint/hf_endpoint.py +++ b/llama_stack/templates/hf-endpoint/hf_endpoint.py @@ -26,9 +26,7 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], "tool_runtime": [ "remote::brave-search", "remote::tavily-search", diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index f6f23a987..ce15f76f9 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -3,10 +3,8 @@ image_name: hf-endpoint apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -53,14 +51,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-endpoint/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: 
${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -76,17 +66,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -128,7 +107,6 @@ shields: - shield_id: ${env.SAFETY_MODEL} vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index 461f97128..5cd91e64a 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -3,10 +3,8 @@ image_name: hf-endpoint apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -48,14 +46,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-endpoint/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -71,17 +61,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-endpoint}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -118,7 +97,6 @@ models: shields: [] vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/hf-serverless/build.yaml b/llama_stack/templates/hf-serverless/build.yaml index c0cc1e2c2..2fff4a7d3 100644 --- a/llama_stack/templates/hf-serverless/build.yaml +++ b/llama_stack/templates/hf-serverless/build.yaml @@ -15,15 +15,9 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - remote::huggingface - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/hf-serverless/hf_serverless.py b/llama_stack/templates/hf-serverless/hf_serverless.py index 25d4c6b30..e1537a681 100644 --- a/llama_stack/templates/hf-serverless/hf_serverless.py +++ b/llama_stack/templates/hf-serverless/hf_serverless.py @@ -26,9 +26,7 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", 
"inline::braintrust"], "tool_runtime": [ "remote::brave-search", "remote::tavily-search", diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index 7f1724f34..c6bfa291b 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -3,10 +3,8 @@ image_name: hf-serverless apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -53,14 +51,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-serverless/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -76,17 +66,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -128,7 +107,6 @@ shields: - shield_id: ${env.SAFETY_MODEL} vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index ac013488b..799c9845b 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -3,10 +3,8 @@ image_name: hf-serverless apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -48,14 +46,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/hf-serverless/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -71,17 +61,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/hf-serverless}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -118,7 +97,6 @@ models: shields: [] vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/meta-reference-gpu/build.yaml b/llama_stack/templates/meta-reference-gpu/build.yaml index b9130fc7d..0c8da8280 100644 --- a/llama_stack/templates/meta-reference-gpu/build.yaml +++ b/llama_stack/templates/meta-reference-gpu/build.yaml @@ -14,15 
+14,9 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - remote::huggingface - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/meta-reference-gpu/meta_reference.py b/llama_stack/templates/meta-reference-gpu/meta_reference.py index 6bb1fcb0a..12a5013cb 100644 --- a/llama_stack/templates/meta-reference-gpu/meta_reference.py +++ b/llama_stack/templates/meta-reference-gpu/meta_reference.py @@ -30,9 +30,7 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], "tool_runtime": [ "remote::brave-search", "remote::tavily-search", diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index 190c08494..82744f307 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -3,10 +3,8 @@ image_name: meta-reference-gpu apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -55,14 +53,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/meta-reference-gpu/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -78,17 +68,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -130,7 +109,6 @@ shields: - shield_id: ${env.SAFETY_MODEL} vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index 07763a4df..1b710c37c 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -3,10 +3,8 @@ image_name: meta-reference-gpu apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -49,14 +47,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/meta-reference-gpu/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: 
${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -72,17 +62,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-gpu}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -119,7 +98,6 @@ models: shields: [] vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml index 7bbcfe5f2..a55d3ddb4 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/build.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/build.yaml @@ -14,15 +14,9 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - remote::huggingface - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py index 5f207bfad..de16930e4 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py +++ b/llama_stack/templates/meta-reference-quantized-gpu/meta_reference.py @@ -25,9 +25,7 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], "tool_runtime": [ "remote::brave-search", "remote::tavily-search", diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index 51b9dc250..13e9177fd 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -3,10 +3,8 @@ image_name: meta-reference-quantized-gpu apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -51,14 +49,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/meta-reference-quantized-gpu/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -74,17 +64,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/meta-reference-quantized-gpu}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - 
provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -121,7 +100,6 @@ models: shields: [] vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/nvidia/build.yaml b/llama_stack/templates/nvidia/build.yaml index 0c788ce86..848b6cd45 100644 --- a/llama_stack/templates/nvidia/build.yaml +++ b/llama_stack/templates/nvidia/build.yaml @@ -12,12 +12,8 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - inline::localfs - scoring: - - inline::basic tool_runtime: - inline::rag-runtime image_type: conda diff --git a/llama_stack/templates/nvidia/nvidia.py b/llama_stack/templates/nvidia/nvidia.py index 308c0e2a6..2cf8e98d4 100644 --- a/llama_stack/templates/nvidia/nvidia.py +++ b/llama_stack/templates/nvidia/nvidia.py @@ -6,11 +6,20 @@ from pathlib import Path -from llama_stack.distribution.datatypes import ModelInput, Provider, ShieldInput, ToolGroupInput +from llama_stack.distribution.datatypes import ( + ModelInput, + Provider, + ShieldInput, + ToolGroupInput, +) from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig from llama_stack.providers.remote.inference.nvidia.models import MODEL_ENTRIES from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig -from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry +from llama_stack.templates.template import ( + DistributionTemplate, + RunConfigSettings, + get_model_registry, +) def get_distribution_template() -> DistributionTemplate: @@ -20,9 +29,7 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["remote::nvidia"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], "datasetio": ["inline::localfs"], - "scoring": ["inline::basic"], "tool_runtime": ["inline::rag-runtime"], } diff --git a/llama_stack/templates/nvidia/run-with-safety.yaml b/llama_stack/templates/nvidia/run-with-safety.yaml index 04da1bcda..a1f6fb5f8 100644 --- a/llama_stack/templates/nvidia/run-with-safety.yaml +++ b/llama_stack/templates/nvidia/run-with-safety.yaml @@ -3,10 +3,8 @@ image_name: nvidia apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -51,14 +49,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db datasetio: - provider_id: localfs provider_type: inline::localfs @@ -67,10 +57,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} tool_runtime: - provider_id: rag-runtime provider_type: inline::rag-runtime @@ -92,7 +78,6 @@ shields: provider_id: nvidia vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::rag diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 
3abdd82a7..4279eda08 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -3,10 +3,8 @@ image_name: nvidia apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -46,14 +44,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/nvidia/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/meta_reference_eval.db datasetio: - provider_id: localfs provider_type: inline::localfs @@ -62,10 +52,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/nvidia}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} tool_runtime: - provider_id: rag-runtime provider_type: inline::rag-runtime @@ -195,7 +181,6 @@ models: shields: [] vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::rag diff --git a/llama_stack/templates/ollama/build.yaml b/llama_stack/templates/ollama/build.yaml index 37b72fc1f..d5a195d5f 100644 --- a/llama_stack/templates/ollama/build.yaml +++ b/llama_stack/templates/ollama/build.yaml @@ -14,15 +14,9 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - remote::huggingface - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/ollama/ollama.py b/llama_stack/templates/ollama/ollama.py index 2d753d3e4..a679607fa 100644 --- a/llama_stack/templates/ollama/ollama.py +++ b/llama_stack/templates/ollama/ollama.py @@ -25,9 +25,7 @@ def get_distribution_template() -> DistributionTemplate: "safety": ["inline::llama-guard"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], - "eval": ["inline::meta-reference"], "datasetio": ["remote::huggingface", "inline::localfs"], - "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], "tool_runtime": [ "remote::brave-search", "remote::tavily-search", diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index 2b8eb44db..0b9e94faa 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -3,10 +3,8 @@ image_name: ollama apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -46,14 +44,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -69,17 +59,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - 
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -129,7 +108,6 @@ shields:
   provider_id: code-scanner
 vector_dbs: []
 datasets: []
-scoring_fns: []
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml
index c9531f417..1451ba96f 100644
--- a/llama_stack/templates/ollama/run.yaml
+++ b/llama_stack/templates/ollama/run.yaml
@@ -3,10 +3,8 @@ image_name: ollama
 apis:
 - agents
 - datasetio
-- eval
 - inference
 - safety
-- scoring
 - telemetry
 - tool_runtime
 - vector_io
@@ -44,14 +42,6 @@ providers:
       service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/ollama/trace_store.db}
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/meta_reference_eval.db
   datasetio:
   - provider_id: huggingface
     provider_type: remote::huggingface
@@ -67,17 +57,6 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/ollama}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -119,7 +98,6 @@ models:
 shields: []
 vector_dbs: []
 datasets: []
-scoring_fns: []
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
diff --git a/llama_stack/templates/open-benchmark/build.yaml b/llama_stack/templates/open-benchmark/build.yaml
index 1db90ef27..b39a17820 100644
--- a/llama_stack/templates/open-benchmark/build.yaml
+++ b/llama_stack/templates/open-benchmark/build.yaml
@@ -18,15 +18,9 @@ distribution_spec:
     - inline::meta-reference
     telemetry:
     - inline::meta-reference
-    eval:
-    - inline::meta-reference
     datasetio:
     - remote::huggingface
     - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py
index 03e524dae..ff5601467 100644
--- a/llama_stack/templates/open-benchmark/open_benchmark.py
+++ b/llama_stack/templates/open-benchmark/open_benchmark.py
@@ -9,7 +9,6 @@
 from typing import Dict, List, Tuple
 
 from llama_stack.apis.datasets import DatasetPurpose, URIDataSource
 from llama_stack.apis.models.models import ModelType
 from llama_stack.distribution.datatypes import (
-    BenchmarkInput,
     DatasetInput,
     ModelInput,
     Provider,
@@ -102,9 +101,7 @@ def get_distribution_template() -> DistributionTemplate:
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
         "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
         "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
@@ -210,33 +207,35 @@ def get_distribution_template() -> DistributionTemplate:
         ),
     ]
-    default_benchmarks = [
-        BenchmarkInput(
-            benchmark_id="meta-reference-simpleqa",
-            dataset_id="simpleqa",
-            grader_ids=["llm-as-judge::405b-simpleqa"],
-        ),
-        BenchmarkInput(
-            benchmark_id="meta-reference-mmlu-cot",
-            dataset_id="mmlu_cot",
-            grader_ids=["basic::regex_parser_multiple_choice_answer"],
-        ),
-        BenchmarkInput(
-            benchmark_id="meta-reference-gpqa-cot",
-            dataset_id="gpqa_cot",
-            grader_ids=["basic::regex_parser_multiple_choice_answer"],
-        ),
-        BenchmarkInput(
-            benchmark_id="meta-reference-math-500",
-            dataset_id="math_500",
-            grader_ids=["basic::regex_parser_math_response"],
-        ),
-        BenchmarkInput(
-            benchmark_id="meta-reference-bfcl",
-            dataset_id="bfcl",
-            grader_ids=["basic::bfcl"],
-        ),
-    ]
+    # TODO(xiyan): fix this back as registerable resources
+    # default_benchmarks = [
+    #     BenchmarkInput(
+    #         benchmark_id="meta-reference-simpleqa",
+    #         dataset_id="simpleqa",
+    #         grader_ids=["llm-as-judge::405b-simpleqa"],
+    #     ),
+    #     BenchmarkInput(
+    #         benchmark_id="meta-reference-mmlu-cot",
+    #         dataset_id="mmlu_cot",
+    #         grader_ids=["basic::regex_parser_multiple_choice_answer"],
+    #     ),
+    #     BenchmarkInput(
+    #         benchmark_id="meta-reference-gpqa-cot",
+    #         dataset_id="gpqa_cot",
+    #         grader_ids=["basic::regex_parser_multiple_choice_answer"],
+    #     ),
+    #     BenchmarkInput(
+    #         benchmark_id="meta-reference-math-500",
+    #         dataset_id="math_500",
+    #         grader_ids=["basic::regex_parser_math_response"],
+    #     ),
+    #     BenchmarkInput(
+    #         benchmark_id="meta-reference-bfcl",
+    #         dataset_id="bfcl",
+    #         grader_ids=["basic::bfcl"],
+    #     ),
+    # ]
+
     return DistributionTemplate(
         name=name,
         distro_type="self_hosted",
@@ -255,7 +254,6 @@ def get_distribution_template() -> DistributionTemplate:
                 default_tool_groups=default_tool_groups,
                 default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
                 default_datasets=default_datasets,
-                default_benchmarks=default_benchmarks,
             ),
         },
         run_config_env_vars={
diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml
index a3c00af56..c0671cbd1 100644
--- a/llama_stack/templates/open-benchmark/run.yaml
+++ b/llama_stack/templates/open-benchmark/run.yaml
@@ -3,10 +3,8 @@ image_name: open-benchmark
 apis:
 - agents
 - datasetio
-- eval
 - inference
 - safety
-- scoring
 - telemetry
 - tool_runtime
 - vector_io
@@ -71,14 +69,6 @@ providers:
       service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/open-benchmark/trace_store.db}
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/meta_reference_eval.db
   datasetio:
   - provider_id: huggingface
     provider_type: remote::huggingface
@@ -94,17 +84,6 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -193,33 +172,7 @@ datasets:
   metadata: {}
   dataset_id: bfcl
   provider_id: huggingface
-scoring_fns: []
-benchmarks:
-- dataset_id: simpleqa
-  grader_ids:
-  - llm-as-judge::405b-simpleqa
-  metadata: {}
-  benchmark_id: meta-reference-simpleqa
-- dataset_id: mmlu_cot
-  grader_ids:
-  - basic::regex_parser_multiple_choice_answer
-  metadata: {}
-  benchmark_id: meta-reference-mmlu-cot
-- dataset_id: gpqa_cot
-  grader_ids:
-  - basic::regex_parser_multiple_choice_answer
-  metadata: {}
-  benchmark_id: meta-reference-gpqa-cot
-- dataset_id: math_500
-  grader_ids:
-  - basic::regex_parser_math_response
-  metadata: {}
-  benchmark_id: meta-reference-math-500
-- dataset_id: bfcl
-  grader_ids:
-  - basic::bfcl
-  metadata: {}
-  benchmark_id: meta-reference-bfcl
+benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
   provider_id: tavily-search
diff --git a/llama_stack/templates/passthrough/build.yaml b/llama_stack/templates/passthrough/build.yaml
index fb1fb1066..6a44293f6 100644
--- a/llama_stack/templates/passthrough/build.yaml
+++ b/llama_stack/templates/passthrough/build.yaml
@@ -15,15 +15,9 @@ distribution_spec:
     - inline::meta-reference
     telemetry:
     - inline::meta-reference
-    eval:
-    - inline::meta-reference
     datasetio:
     - remote::huggingface
     - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
diff --git a/llama_stack/templates/passthrough/passthrough.py b/llama_stack/templates/passthrough/passthrough.py
index cc3f55937..589aca229 100644
--- a/llama_stack/templates/passthrough/passthrough.py
+++ b/llama_stack/templates/passthrough/passthrough.py
@@ -21,10 +21,7 @@ from llama_stack.providers.remote.inference.passthrough.config import (
     PassthroughImplConfig,
 )
 from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
-from llama_stack.templates.template import (
-    DistributionTemplate,
-    RunConfigSettings,
-)
+from llama_stack.templates.template import DistributionTemplate, RunConfigSettings
 
 
 def get_distribution_template() -> DistributionTemplate:
@@ -34,9 +31,7 @@ def get_distribution_template() -> DistributionTemplate:
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
         "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
         "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
diff --git a/llama_stack/templates/passthrough/run-with-safety.yaml b/llama_stack/templates/passthrough/run-with-safety.yaml
index fbfa4afe7..72aced336 100644
--- a/llama_stack/templates/passthrough/run-with-safety.yaml
+++ b/llama_stack/templates/passthrough/run-with-safety.yaml
@@ -3,10 +3,8 @@ image_name: passthrough
 apis:
 - agents
 - datasetio
-- eval
 - inference
 - safety
-- scoring
 - telemetry
 - tool_runtime
 - vector_io
@@ -53,14 +51,6 @@ providers:
       service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/passthrough/trace_store.db}
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/meta_reference_eval.db
   datasetio:
   - provider_id: huggingface
     provider_type: remote::huggingface
@@ -76,17 +66,6 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -139,7 +118,6 @@ shields:
   provider_id: code-scanner
 vector_dbs: []
 datasets: []
-scoring_fns: []
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
diff --git a/llama_stack/templates/passthrough/run.yaml b/llama_stack/templates/passthrough/run.yaml
index 6956bc6e3..a239f2bf7 100644
--- a/llama_stack/templates/passthrough/run.yaml
+++ b/llama_stack/templates/passthrough/run.yaml
@@ -3,10 +3,8 @@ image_name: passthrough
 apis:
 - agents
 - datasetio
-- eval
 - inference
 - safety
-- scoring
 - telemetry
 - tool_runtime
 - vector_io
@@ -48,14 +46,6 @@ providers:
       service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/passthrough/trace_store.db}
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/meta_reference_eval.db
   datasetio:
   - provider_id: huggingface
     provider_type: remote::huggingface
@@ -71,17 +61,6 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/passthrough}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -129,7 +108,6 @@ shields:
 - shield_id: meta-llama/Llama-Guard-3-8B
 vector_dbs: []
 datasets: []
-scoring_fns: []
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
diff --git a/llama_stack/templates/remote-vllm/build.yaml b/llama_stack/templates/remote-vllm/build.yaml
index b2bbf853a..0437d76d6 100644
--- a/llama_stack/templates/remote-vllm/build.yaml
+++ b/llama_stack/templates/remote-vllm/build.yaml
@@ -13,15 +13,9 @@ distribution_spec:
     - inline::llama-guard
     agents:
     - inline::meta-reference
-    eval:
-    - inline::meta-reference
     datasetio:
     - remote::huggingface
     - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
     telemetry:
     - inline::meta-reference
     tool_runtime:
diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml
index 3830ffcdb..c462ad924 100644
--- a/llama_stack/templates/remote-vllm/run-with-safety.yaml
+++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml
@@ -3,10 +3,8 @@ image_name: remote-vllm
 apis:
 - agents
 - datasetio
-- eval
 - inference
 - safety
-- scoring
 - telemetry
 - tool_runtime
 - vector_io
@@ -50,14 +48,6 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/meta_reference_eval.db
   datasetio:
   - provider_id: huggingface
     provider_type: remote::huggingface
@@ -73,17 +63,6 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:}
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -136,7 +115,6 @@ shields:
 - shield_id: ${env.SAFETY_MODEL}
 vector_dbs: []
 datasets: []
-scoring_fns: []
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml
index b6bba1252..71f4ea5cc 100644
--- a/llama_stack/templates/remote-vllm/run.yaml
+++ b/llama_stack/templates/remote-vllm/run.yaml
@@ -3,10 +3,8 @@ image_name: remote-vllm
 apis:
 - agents
 - datasetio
-- eval
 - inference
 - safety
-- scoring
 - telemetry
 - tool_runtime
 - vector_io
@@ -43,14 +41,6 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/agents_store.db
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/meta_reference_eval.db
   datasetio:
   - provider_id: huggingface
     provider_type: remote::huggingface
@@ -66,17 +56,6 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/remote-vllm}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:}
   telemetry:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
@@ -124,7 +103,6 @@ models:
 shields: []
 vector_dbs: []
 datasets: []
-scoring_fns: []
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
diff --git a/llama_stack/templates/remote-vllm/vllm.py b/llama_stack/templates/remote-vllm/vllm.py
index 9901fc83b..f43873b02 100644
--- a/llama_stack/templates/remote-vllm/vllm.py
+++ b/llama_stack/templates/remote-vllm/vllm.py
@@ -27,9 +27,7 @@ def get_distribution_template() -> DistributionTemplate:
         "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"],
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
         "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
         "telemetry": ["inline::meta-reference"],
         "tool_runtime": [
             "remote::brave-search",
diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml
index 616d82a61..0a6f71c52 100644
--- a/llama_stack/templates/sambanova/run.yaml
+++ b/llama_stack/templates/sambanova/run.yaml
@@ -169,7 +169,6 @@ shields:
 - shield_id: meta-llama/Llama-Guard-3-8B
 vector_dbs: []
 datasets: []
-scoring_fns: []
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
diff --git a/llama_stack/templates/tgi/build.yaml b/llama_stack/templates/tgi/build.yaml
index 9fe79647c..e870c5eb1 100644
--- a/llama_stack/templates/tgi/build.yaml
+++ b/llama_stack/templates/tgi/build.yaml
@@ -15,15 +15,9 @@ distribution_spec:
     - inline::meta-reference
     telemetry:
     - inline::meta-reference
-    eval:
-    - inline::meta-reference
     datasetio:
     - remote::huggingface
     - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml
index db54c0393..2e27ccdbf 100644
--- a/llama_stack/templates/tgi/run-with-safety.yaml
+++ b/llama_stack/templates/tgi/run-with-safety.yaml
@@ -3,10 +3,8 @@ image_name: tgi
 apis:
 - agents
 - datasetio
-- eval
 - inference
 - safety
-- scoring
 - telemetry
 - tool_runtime
 - vector_io
@@ -48,14 +46,6 @@ providers:
       service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/tgi/trace_store.db}
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/meta_reference_eval.db
   datasetio:
   - provider_id: huggingface
     provider_type: remote::huggingface
@@ -71,17 +61,6 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -118,7 +97,6 @@ shields:
 - shield_id: ${env.SAFETY_MODEL}
 vector_dbs: []
 datasets: []
-scoring_fns: []
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml
index dafb59aa9..9e0bdeb21 100644
--- a/llama_stack/templates/tgi/run.yaml
+++ b/llama_stack/templates/tgi/run.yaml
@@ -3,10 +3,8 @@ image_name: tgi
 apis:
 - agents
 - datasetio
-- eval
 - inference
 - safety
-- scoring
 - telemetry
 - tool_runtime
 - vector_io
@@ -47,14 +45,6 @@ providers:
       service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/tgi/trace_store.db}
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/meta_reference_eval.db
   datasetio:
   - provider_id: huggingface
     provider_type: remote::huggingface
@@ -70,17 +60,6 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/tgi}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -117,7 +96,6 @@ models:
 shields: []
 vector_dbs: []
 datasets: []
-scoring_fns: []
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
diff --git a/llama_stack/templates/tgi/tgi.py b/llama_stack/templates/tgi/tgi.py
index 45ea74db6..5fcf336c7 100644
--- a/llama_stack/templates/tgi/tgi.py
+++ b/llama_stack/templates/tgi/tgi.py
@@ -28,9 +28,7 @@ def get_distribution_template() -> DistributionTemplate:
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
         "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
         "datasetio": ["remote::huggingface", "inline::localfs"],
"scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"], "tool_runtime": [ "remote::brave-search", "remote::tavily-search", diff --git a/llama_stack/templates/together/build.yaml b/llama_stack/templates/together/build.yaml index 834a3ecaf..8892475bb 100644 --- a/llama_stack/templates/together/build.yaml +++ b/llama_stack/templates/together/build.yaml @@ -15,15 +15,9 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - remote::huggingface - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index e0bf46c11..c386aeb83 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -3,10 +3,8 @@ image_name: together apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -53,14 +51,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/together/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -76,17 +66,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -233,7 +212,6 @@ shields: provider_id: code-scanner vector_dbs: [] datasets: [] -scoring_fns: [] benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 9d0acaf31..0c808eed6 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -3,10 +3,8 @@ image_name: together apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -48,14 +46,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/together/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -71,17 +61,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/together}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: 
-      openai_api_key: ${env.OPENAI_API_KEY:}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -223,7 +202,6 @@ shields:
 - shield_id: meta-llama/Llama-Guard-3-8B
 vector_dbs: []
 datasets: []
-scoring_fns: []
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
diff --git a/llama_stack/templates/together/together.py b/llama_stack/templates/together/together.py
index fce03a1b2..fbb9417b9 100644
--- a/llama_stack/templates/together/together.py
+++ b/llama_stack/templates/together/together.py
@@ -19,7 +19,11 @@ from llama_stack.providers.inline.inference.sentence_transformers import (
 from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
 from llama_stack.providers.remote.inference.together import TogetherImplConfig
 from llama_stack.providers.remote.inference.together.models import MODEL_ENTRIES
-from llama_stack.templates.template import DistributionTemplate, RunConfigSettings, get_model_registry
+from llama_stack.templates.template import (
+    DistributionTemplate,
+    RunConfigSettings,
+    get_model_registry,
+)
 
 
 def get_distribution_template() -> DistributionTemplate:
@@ -29,9 +33,7 @@ def get_distribution_template() -> DistributionTemplate:
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
         "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
         "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
diff --git a/llama_stack/templates/vllm-gpu/build.yaml b/llama_stack/templates/vllm-gpu/build.yaml
index 8eb44dc1b..93707544d 100644
--- a/llama_stack/templates/vllm-gpu/build.yaml
+++ b/llama_stack/templates/vllm-gpu/build.yaml
@@ -15,15 +15,9 @@ distribution_spec:
     - inline::meta-reference
     telemetry:
     - inline::meta-reference
-    eval:
-    - inline::meta-reference
     datasetio:
     - remote::huggingface
     - inline::localfs
-    scoring:
-    - inline::basic
-    - inline::llm-as-judge
-    - inline::braintrust
     tool_runtime:
     - remote::brave-search
     - remote::tavily-search
diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml
index bf85de0a2..d3c666438 100644
--- a/llama_stack/templates/vllm-gpu/run.yaml
+++ b/llama_stack/templates/vllm-gpu/run.yaml
@@ -3,10 +3,8 @@ image_name: vllm-gpu
 apis:
 - agents
 - datasetio
-- eval
 - inference
 - safety
-- scoring
 - telemetry
 - tool_runtime
 - vector_io
@@ -52,14 +50,6 @@ providers:
       service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
       sinks: ${env.TELEMETRY_SINKS:console,sqlite}
       sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/vllm-gpu/trace_store.db}
-  eval:
-  - provider_id: meta-reference
-    provider_type: inline::meta-reference
-    config:
-      kvstore:
-        type: sqlite
-        namespace: null
-        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/meta_reference_eval.db
   datasetio:
   - provider_id: huggingface
     provider_type: remote::huggingface
@@ -75,17 +65,6 @@ providers:
         type: sqlite
         namespace: null
         db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/localfs_datasetio.db
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-    config: {}
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-    config: {}
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -122,7 +101,6 @@ models:
 shields: []
 vector_dbs: []
 datasets: []
-scoring_fns: []
 benchmarks: []
 tool_groups:
 - toolgroup_id: builtin::websearch
diff --git a/llama_stack/templates/vllm-gpu/vllm.py b/llama_stack/templates/vllm-gpu/vllm.py
index 8883f117f..2235c8642 100644
--- a/llama_stack/templates/vllm-gpu/vllm.py
+++ b/llama_stack/templates/vllm-gpu/vllm.py
@@ -25,9 +25,7 @@ def get_distribution_template() -> DistributionTemplate:
         "safety": ["inline::llama-guard"],
         "agents": ["inline::meta-reference"],
         "telemetry": ["inline::meta-reference"],
-        "eval": ["inline::meta-reference"],
         "datasetio": ["remote::huggingface", "inline::localfs"],
-        "scoring": ["inline::basic", "inline::llm-as-judge", "inline::braintrust"],
         "tool_runtime": [
             "remote::brave-search",
             "remote::tavily-search",
diff --git a/pyproject.toml b/pyproject.toml
index cf4e81ab8..807da4337 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -168,7 +168,6 @@ exclude = [
     "^llama_stack/apis/common/training_types\\.py$",
     "^llama_stack/apis/datasetio/datasetio\\.py$",
     "^llama_stack/apis/datasets/datasets\\.py$",
-    "^llama_stack/apis/eval/eval\\.py$",
     "^llama_stack/apis/files/files\\.py$",
     "^llama_stack/apis/inference/inference\\.py$",
     "^llama_stack/apis/inspect/inspect\\.py$",
@@ -177,8 +176,6 @@ exclude = [
     "^llama_stack/apis/providers/providers\\.py$",
     "^llama_stack/apis/resource\\.py$",
     "^llama_stack/apis/safety/safety\\.py$",
-    "^llama_stack/apis/scoring/scoring\\.py$",
-    "^llama_stack/apis/scoring_functions/scoring_functions\\.py$",
     "^llama_stack/apis/shields/shields\\.py$",
     "^llama_stack/apis/synthetic_data_generation/synthetic_data_generation\\.py$",
     "^llama_stack/apis/telemetry/telemetry\\.py$",
@@ -218,6 +215,7 @@ exclude = [
     "^llama_stack/providers/inline/agents/meta_reference/agent_instance\\.py$",
     "^llama_stack/providers/inline/agents/meta_reference/agents\\.py$",
     "^llama_stack/providers/inline/agents/meta_reference/safety\\.py$",
+    "^llama_stack/providers/inline/evaluation/meta_reference/evaluation\\.py$",
     "^llama_stack/providers/inline/datasetio/localfs/",
     "^llama_stack/providers/inline/eval/meta_reference/eval\\.py$",
     "^llama_stack/providers/inline/inference/meta_reference/config\\.py$",
diff --git a/tests/integration/eval/test_eval.py b/tests/integration/eval/test_eval.py
index c4aa0fa1b..7a519f208 100644
--- a/tests/integration/eval/test_eval.py
+++ b/tests/integration/eval/test_eval.py
@@ -16,6 +16,7 @@ from ..datasets.test_datasets import data_url_from_file
 
 
 @pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])
+@pytest.mark.skip(reason="TODO(xiyan): fix this")
 def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
     dataset = llama_stack_client.datasets.register(
         purpose="eval/messages-answer",
@@ -65,6 +66,7 @@
 
 
 @pytest.mark.parametrize("scoring_fn_id", ["basic::subset_of"])
+@pytest.mark.skip(reason="TODO(xiyan): fix this")
 def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
     dataset = llama_stack_client.datasets.register(
         purpose="eval/messages-answer",
diff --git a/tests/integration/scoring/test_scoring.py b/tests/integration/scoring/test_scoring.py
index 315ff050c..675090f7f 100644
--- a/tests/integration/scoring/test_scoring.py
+++ b/tests/integration/scoring/test_scoring.py
@@ -43,12 +43,14 @@ def register_scoring_function(
     )
 
 
+@pytest.mark.skip(reason="TODO(xiyan): fix this")
 def test_scoring_functions_list(llama_stack_client):
     response = llama_stack_client.scoring_functions.list()
     assert isinstance(response, list)
     assert len(response) > 0
 
 
+@pytest.mark.skip(reason="TODO(xiyan): fix this")
 def test_scoring_functions_register(
     llama_stack_client,
     sample_scoring_fn_id,
@@ -81,6 +83,7 @@ def test_scoring_functions_register(
 
 
 @pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])
+@pytest.mark.skip(reason="TODO(xiyan): fix this")
 def test_scoring_score(llama_stack_client, scoring_fn_id):
     # scoring individual rows
     df = pd.read_csv(Path(__file__).parent.parent / "datasets" / "test_dataset.csv")
@@ -100,6 +103,7 @@ def test_scoring_score(llama_stack_client, scoring_fn_id):
         assert len(response.results[x].score_rows) == len(rows)
 
 
+@pytest.mark.skip(reason="TODO(xiyan): fix this")
 def test_scoring_score_with_params_llm_as_judge(
     llama_stack_client,
     sample_judge_prompt_template,
@@ -139,6 +143,7 @@
         "braintrust",
     ],
 )
+@pytest.mark.skip(reason="TODO(xiyan): fix this")
 def test_scoring_score_with_aggregation_functions(
     llama_stack_client,
     sample_judge_prompt_template,