From 011fd59a29f1472704cc676c6405cdf418c02756 Mon Sep 17 00:00:00 2001 From: Xi Yan Date: Tue, 18 Mar 2025 21:46:55 -0700 Subject: [PATCH] open benchmark --- distributions/dependencies.json | 54 ------------------- .../templates/open-benchmark/build.yaml | 6 --- llama_stack/templates/open-benchmark/run.yaml | 49 +---------------- 3 files changed, 1 insertion(+), 108 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 33b497a33..1767523d6 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,7 +1,6 @@ { "bedrock": [ "aiosqlite", - "autoevals", "blobfile", "boto3", "chardet", @@ -15,7 +14,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -30,12 +28,10 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn" ], "cerebras": [ "aiosqlite", - "autoevals", "blobfile", "cerebras_cloud_sdk", "chardet", @@ -48,7 +44,6 @@ "matplotlib", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -63,14 +58,12 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], "ci-tests": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -83,7 +76,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -99,7 +91,6 @@ "sqlite-vec", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -107,7 +98,6 @@ "dell": [ "aiohttp", "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -120,7 +110,6 @@ "matplotlib", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -135,14 +124,12 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], "dev": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -156,7 +143,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -172,14 +158,12 @@ "sqlite-vec", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], "fireworks": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -193,7 +177,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -208,14 +191,12 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], "groq": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "datasets", @@ -227,7 +208,6 @@ "matplotlib", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -242,13 +222,11 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn" ], "hf-endpoint": [ "aiohttp", "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -262,7 +240,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -277,13 +254,11 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn" ], "hf-serverless": [ "aiohttp", "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -297,7 +272,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -312,7 +286,6 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -320,7 +293,6 @@ "meta-reference-gpu": [ "accelerate", "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -335,7 +307,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -353,14 +324,12 @@ "torchvision", "tqdm", "transformers", - "tree_sitter", "uvicorn", "zmq" ], "meta-reference-quantized-gpu": [ "accelerate", "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -376,7 +345,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -395,7 +363,6 @@ "torchvision", "tqdm", "transformers", - "tree_sitter", "uvicorn", "zmq" ], @@ -425,13 +392,11 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn" ], "ollama": [ "aiohttp", "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -445,7 +410,6 @@ "nltk", "numpy", "ollama", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -460,12 +424,10 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn" ], "open-benchmark": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -478,7 +440,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -495,12 +456,10 @@ "together", "tqdm", "transformers", - "tree_sitter", "uvicorn" ], "passthrough": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -513,7 +472,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -528,14 +486,12 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], "remote-vllm": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -563,7 +519,6 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" @@ -600,7 +555,6 @@ "tgi": [ "aiohttp", "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -614,7 +568,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -629,14 +582,12 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], "together": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -649,7 +600,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -665,14 +615,12 @@ "together", "tqdm", "transformers", - "tree_sitter", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], "vllm-gpu": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", @@ -685,7 +633,6 @@ "mcp", "nltk", "numpy", - "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", "pandas", @@ -700,7 +647,6 @@ "sentencepiece", "tqdm", "transformers", - "tree_sitter", "uvicorn", "vllm", "sentence-transformers --no-deps", diff --git a/llama_stack/templates/open-benchmark/build.yaml b/llama_stack/templates/open-benchmark/build.yaml index 1db90ef27..b39a17820 100644 --- a/llama_stack/templates/open-benchmark/build.yaml +++ b/llama_stack/templates/open-benchmark/build.yaml @@ -18,15 +18,9 @@ distribution_spec: - inline::meta-reference telemetry: - inline::meta-reference - eval: - - inline::meta-reference datasetio: - remote::huggingface - inline::localfs - scoring: - - inline::basic - - inline::llm-as-judge - - inline::braintrust tool_runtime: - remote::brave-search - remote::tavily-search diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index a3c00af56..c0671cbd1 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -3,10 +3,8 @@ image_name: open-benchmark apis: - agents - datasetio -- eval - inference - safety -- scoring - telemetry - tool_runtime - vector_io @@ -71,14 +69,6 @@ providers: service_name: ${env.OTEL_SERVICE_NAME:llama-stack} sinks: ${env.TELEMETRY_SINKS:console,sqlite} sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/open-benchmark/trace_store.db} - eval: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - kvstore: - type: sqlite - namespace: null - db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/meta_reference_eval.db datasetio: - provider_id: huggingface provider_type: remote::huggingface @@ -94,17 +84,6 @@ providers: type: sqlite namespace: null db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/open-benchmark}/localfs_datasetio.db - scoring: - - provider_id: basic - provider_type: inline::basic - config: {} - - provider_id: llm-as-judge - provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: ${env.OPENAI_API_KEY:} tool_runtime: - provider_id: brave-search provider_type: remote::brave-search @@ -193,33 +172,7 @@ datasets: metadata: {} dataset_id: bfcl provider_id: huggingface -scoring_fns: [] -benchmarks: -- dataset_id: simpleqa - grader_ids: - - llm-as-judge::405b-simpleqa - metadata: {} - benchmark_id: meta-reference-simpleqa -- dataset_id: mmlu_cot - grader_ids: - - basic::regex_parser_multiple_choice_answer - metadata: {} - benchmark_id: meta-reference-mmlu-cot -- dataset_id: gpqa_cot - grader_ids: - - basic::regex_parser_multiple_choice_answer - metadata: {} - benchmark_id: meta-reference-gpqa-cot -- dataset_id: math_500 - grader_ids: - - basic::regex_parser_math_response - metadata: {} - benchmark_id: meta-reference-math-500 -- dataset_id: bfcl - grader_ids: - - basic::bfcl - metadata: {} - benchmark_id: meta-reference-bfcl +benchmarks: [] tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search