From 0248f3590a58e9ee4e7b820a85c4a54a8273d77d Mon Sep 17 00:00:00 2001 From: Botao Chen Date: Tue, 18 Mar 2025 23:38:45 -0700 Subject: [PATCH] temp commit --- distributions/dependencies.json | 57 +++++++++++++++++++ .../providers/inline/scoring/basic/scoring.py | 2 + .../open-benchmark/open_benchmark.py | 13 +++++ llama_stack/templates/open-benchmark/run.yaml | 12 ++++ 4 files changed, 84 insertions(+) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 33b497a33..da0de2820 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -7,10 +7,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -23,6 +25,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -41,10 +44,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "nltk", "numpy", @@ -56,6 +61,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -75,10 +81,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "fastapi", "fire", "fireworks-ai", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -91,6 +99,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -112,11 +121,13 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", "huggingface_hub", + "langdetect", "matplotlib", "nltk", "numpy", @@ -128,6 +139,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -147,10 +159,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "fastapi", "fire", "fireworks-ai", "httpx", + "langdetect", "litellm", "matplotlib", "mcp", @@ -164,6 +178,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -184,11 +199,13 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "fireworks-ai", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -201,6 +218,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -219,10 +237,12 @@ "blobfile", "chardet", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "litellm", "matplotlib", "nltk", @@ -235,6 +255,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -253,11 +274,13 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", "huggingface_hub", + "langdetect", "matplotlib", "mcp", "nltk", @@ -270,6 +293,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -288,11 +312,13 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", "huggingface_hub", + "langdetect", "matplotlib", "mcp", "nltk", @@ -305,6 +331,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -325,11 +352,13 @@ "chardet", "chromadb-client", "datasets", + "emoji", "fairscale", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "lm-format-enforcer", "matplotlib", "mcp", @@ -343,6 +372,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -365,12 +395,14 @@ "chardet", "chromadb-client", "datasets", + "emoji", "fairscale", "faiss-cpu", "fastapi", "fbgemm-gpu", "fire", "httpx", + "langdetect", "lm-format-enforcer", "matplotlib", "mcp", @@ -384,6 +416,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -403,10 +436,12 @@ "aiosqlite", "blobfile", "chardet", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "nltk", "numpy", @@ -418,6 +453,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -436,10 +472,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -453,6 +491,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -470,9 +509,11 @@ "chardet", "chromadb-client", "datasets", + "emoji", "fastapi", "fire", "httpx", + "langdetect", "litellm", "matplotlib", "mcp", @@ -486,6 +527,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -505,10 +547,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -521,6 +565,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -540,10 +585,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -556,6 +603,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -605,11 +653,13 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", "huggingface_hub", + "langdetect", "matplotlib", "mcp", "nltk", @@ -622,6 +672,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -641,10 +692,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -657,6 +710,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -677,10 +731,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -693,6 +749,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", diff --git a/llama_stack/providers/inline/scoring/basic/scoring.py b/llama_stack/providers/inline/scoring/basic/scoring.py index a735166e1..9434dd8ce 100644 --- a/llama_stack/providers/inline/scoring/basic/scoring.py +++ b/llama_stack/providers/inline/scoring/basic/scoring.py @@ -24,6 +24,7 @@ from llama_stack.providers.utils.common.data_schema_validator import ( from .config import BasicScoringConfig from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn from .scoring_fn.equality_scoring_fn import EqualityScoringFn +from .scoring_fn.ifeval_scoring_fn import IfEvalScoringFn from .scoring_fn.regex_parser_math_response_scoring_fn import ( RegexParserMathResponseScoringFn, ) @@ -36,6 +37,7 @@ FIXED_FNS = [ RegexParserScoringFn, RegexParserMathResponseScoringFn, BFCLScoringFn, + IfEvalScoringFn, ] diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py index b339e8c80..e29b987ee 100644 --- a/llama_stack/templates/open-benchmark/open_benchmark.py +++ b/llama_stack/templates/open-benchmark/open_benchmark.py @@ -208,6 +208,14 @@ def get_distribution_template() -> DistributionTemplate: uri="huggingface://datasets/llamastack/bfcl_v3?split=train", ), ), + DatasetInput( + dataset_id="IfEval", + provider_id="huggingface", + purpose=DatasetPurpose.eval_messages_answer, + source=URIDataSource( + uri="huggingface://datasets/llamastack/IfEval?split=train", + ), + ), ] default_benchmarks = [ @@ -236,6 +244,11 @@ def get_distribution_template() -> DistributionTemplate: dataset_id="bfcl", scoring_functions=["basic::bfcl"], ), + BenchmarkInput( + benchmark_id="meta-reference-IfEval", + dataset_id="IfEval", + scoring_functions=["basic::IfEval"], + ), ] return DistributionTemplate( name=name, diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index 93f437273..a9d082d53 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -193,6 +193,13 @@ datasets: metadata: {} dataset_id: bfcl provider_id: huggingface +- purpose: eval/messages-answer + source: + type: uri + uri: huggingface://datasets/llamastack/IfEval?split=train + metadata: {} + dataset_id: IfEval + provider_id: huggingface scoring_fns: [] benchmarks: - dataset_id: simpleqa @@ -220,6 +227,11 @@ benchmarks: - basic::bfcl metadata: {} benchmark_id: meta-reference-bfcl +- dataset_id: IfEval + scoring_functions: + - basic::IfEval + metadata: {} + benchmark_id: meta-reference-IfEval tool_groups: - toolgroup_id: builtin::websearch provider_id: tavily-search