forked from phoenix-oss/llama-stack-mirror
feat: [New Eval Benchmark] IfEval (#1708)
# What does this PR do?
This PR adds IfEval, a new open eval benchmark based on the paper https://arxiv.org/abs/2311.07911, to measure a model's instruction-following capability.

## Test Plan
Spin up a Llama Stack server with the open-benchmark template, then on the client side run
`llama-stack-client --endpoint xxx eval run-benchmark "meta-reference-ifeval" --model-id "meta-llama/Llama-3.3-70B-Instruct" --output-dir "/home/markchen1015/" --num-examples 20`
and check the aggregated eval results.
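For reference, roughly the same run can be started from the Python SDK. This is only a sketch: the endpoint, sampling parameters, and the exact `eval.run_eval` / benchmark-config shape are assumptions, and the CLI command above is the path that was actually tested.

```python
# Rough SDK counterpart of the CLI command above (sketch only). The base_url,
# sampling params, and the eval.run_eval()/benchmark_config shape are assumptions,
# not something exercised in this PR's test plan.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed server endpoint

job = client.eval.run_eval(
    benchmark_id="meta-reference-ifeval",
    benchmark_config={
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.3-70B-Instruct",
            "sampling_params": {"strategy": {"type": "greedy"}, "max_tokens": 512},
        },
    },
)
print(job)  # job handle; poll for status and fetch aggregate results when it completes
```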
parent a7008dc15d
commit f369871083
13 changed files with 3,520 additions and 1 deletion
.github/workflows/integration-tests.yml
@@ -52,6 +52,7 @@ jobs:
           # always test against the latest version of the client
           uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main
           uv pip install -e .
+          llama stack build --template ollama --image-type venv

       - name: Wait for Ollama to start
         run: |
Per-distribution dependency lists: the 19 package lists touched in this file each gain the three new scoring dependencies "emoji", "langdetect", and "pythainlp". Every list is modified by the same pair of hunks; the first pair is representative:

@@ -7,10 +7,12 @@
     "chardet",
     "chromadb-client",
     "datasets",
+    "emoji",
     "faiss-cpu",
     "fastapi",
     "fire",
     "httpx",
+    "langdetect",
     "matplotlib",
     "mcp",
     "nltk",
@@ -23,6 +25,7 @@
     "psycopg2-binary",
     "pymongo",
     "pypdf",
+    "pythainlp",
     "redis",
     "requests",
     "scikit-learn",

The remaining hunks repeat the same three additions in the other distributions' lists (including the templates that also carry "fireworks-ai", "huggingface_hub", "litellm", "lm-format-enforcer", "fairscale", and "fbgemm-gpu").
docs/_static/llama-stack-spec.html
@@ -6268,6 +6268,7 @@
             "type": "string",
             "enum": [
                 "average",
+                "weighted_average",
                 "median",
                 "categorical_count",
                 "accuracy"
docs/_static/llama-stack-spec.yaml
@@ -4389,6 +4389,7 @@ components:
       type: string
       enum:
         - average
+        - weighted_average
         - median
         - categorical_count
         - accuracy
@@ -36,6 +36,7 @@ class ScoringFnParamsType(Enum):

 @json_schema_type
 class AggregationFunctionType(Enum):
     average = "average"
+    weighted_average = "weighted_average"
     median = "median"
     categorical_count = "categorical_count"
     accuracy = "accuracy"
@@ -25,6 +25,7 @@ from .config import BasicScoringConfig
 from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn
 from .scoring_fn.docvqa_scoring_fn import DocVQAScoringFn
 from .scoring_fn.equality_scoring_fn import EqualityScoringFn
+from .scoring_fn.ifeval_scoring_fn import IfEvalScoringFn
 from .scoring_fn.regex_parser_math_response_scoring_fn import (
     RegexParserMathResponseScoringFn,
 )
@@ -37,6 +38,7 @@ FIXED_FNS = [
     RegexParserScoringFn,
     RegexParserMathResponseScoringFn,
     BFCLScoringFn,
+    IfEvalScoringFn,
     DocVQAScoringFn,
 ]
llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py (new file)
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from llama_stack.apis.common.type_system import NumberType
+from llama_stack.apis.scoring_functions import (
+    AggregationFunctionType,
+    BasicScoringFnParams,
+    ScoringFn,
+)
+
+ifeval = ScoringFn(
+    identifier="basic::ifeval",
+    description="Eval instruction-following capacity by checking how many instructions can be followed in each example",
+    return_type=NumberType(),
+    provider_id="basic",
+    provider_resource_id="ifeval",
+    params=BasicScoringFnParams(
+        aggregation_functions=[AggregationFunctionType.weighted_average],
+    ),
+)
llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py (new file)
@@ -0,0 +1,79 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any, Dict, Optional
+
+from llama_stack.apis.scoring import ScoringResultRow
+from llama_stack.apis.scoring_functions import ScoringFnParams
+from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
+
+from ..utils.ifeval_utils import INSTRUCTION_DICT, INSTRUCTION_LIST
+from .fn_defs.ifeval import (
+    ifeval,
+)
+
+
+class IfEvalScoringFn(RegisteredBaseScoringFn):
+    """
+    A scoring_fn for the Instruction-Following Eval (IFEval) benchmark
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.supported_fn_defs_registry = {
+            ifeval.identifier: ifeval,
+        }
+
+    async def score_row(
+        self,
+        input_row: Dict[str, Any],
+        scoring_fn_identifier: Optional[str] = None,
+        scoring_params: Optional[ScoringFnParams] = None,
+    ) -> ScoringResultRow:
+        assert scoring_fn_identifier is not None, "Scoring function identifier not found."
+        fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
+        if scoring_params is not None:
+            fn_def.params = scoring_params
+
+        instruction_list = input_row["instruction_id_list"]
+        generated_answer = input_row["generated_answer"].strip()
+
+        is_following_list = []
+        results = dict(
+            {k + "_correct": 0.0 for k in INSTRUCTION_LIST},
+            **{k + "_total": 0.0 for k in INSTRUCTION_LIST},
+        )
+
+        for index, instruction_id in enumerate(instruction_list):
+            instruction_cls = INSTRUCTION_DICT[instruction_id]
+            instruction = instruction_cls(instruction_id)
+            results[instruction_id + "_total"] += 1.0
+            results[instruction_id.split(":")[0] + "_total"] += 1.0
+
+            clean_input_row = {k: v for k, v in input_row["kwargs"][index].items() if v is not None}
+            print(clean_input_row)
+            instruction.build_description(**clean_input_row)
+            args = instruction.get_instruction_args()
+            if args and "prompt" in args:
+                instruction.build_description(prompt=input_row["prompt"])
+
+            if generated_answer and instruction.check_following(generated_answer):
+                is_following_list.append(True)
+                results[instruction_id + "_correct"] += 1.0
+                results[instruction_id.split(":")[0] + "_correct"] += 1.0
+            else:
+                is_following_list.append(False)
+
+        if len(is_following_list) == 0:
+            return {
+                "score": 0.0,
+                "weight": 0.0,
+            }
+
+        return {
+            "score": float(sum(is_following_list)) / float(len(is_following_list)),
+            "weight": float(len(is_following_list)),
+        }
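As a rough illustration of the per-row contract above, a single row can be scored directly. The prompt, instruction id, and kwargs below are invented placeholders (the instruction id is assumed to exist in INSTRUCTION_DICT); real rows come from the llamastack/IfEval dataset registered later in this PR.

```python
# Hypothetical single-row check of IfEvalScoringFn.score_row(); the row contents
# are invented for illustration and the instruction id is assumed to be present
# in INSTRUCTION_DICT -- real rows come from the IfEval dataset.
import asyncio

from llama_stack.providers.inline.scoring.basic.scoring_fn.ifeval_scoring_fn import IfEvalScoringFn

row = {
    "prompt": "Describe the ocean.",
    "generated_answer": "THE OCEAN IS VAST AND DEEP.",
    "instruction_id_list": ["change_case:english_capital"],  # assumed instruction id
    "kwargs": [{}],  # no per-instruction kwargs needed for this one
}

result = asyncio.run(IfEvalScoringFn().score_row(row, scoring_fn_identifier="basic::ifeval"))
print(result)  # e.g. {"score": 1.0, "weight": 1.0} if every listed instruction is followed
```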
llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py (new file, 3,319 lines)
File diff suppressed because it is too large.
@@ -14,7 +14,7 @@ def available_providers() -> List[ProviderSpec]:
     InlineProviderSpec(
         api=Api.eval,
         provider_type="inline::meta-reference",
-        pip_packages=["tree_sitter"],
+        pip_packages=["tree_sitter", "pythainlp", "langdetect", "emoji", "nltk"],
         module="llama_stack.providers.inline.eval.meta_reference",
         config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig",
         api_dependencies=[
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def aggregate_weighted_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
|
||||||
|
return {
|
||||||
|
"weighted_average": sum(
|
||||||
|
result["score"] * result["weight"]
|
||||||
|
for result in scoring_results
|
||||||
|
if result["score"] is not None and result["weight"] is not None
|
||||||
|
)
|
||||||
|
/ sum(result["weight"] for result in scoring_results if result["weight"] is not None),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def aggregate_categorical_count(
|
def aggregate_categorical_count(
|
||||||
scoring_results: List[ScoringResultRow],
|
scoring_results: List[ScoringResultRow],
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
|
@ -46,6 +57,7 @@ def aggregate_median(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]:
|
||||||
AGGREGATION_FUNCTIONS = {
|
AGGREGATION_FUNCTIONS = {
|
||||||
AggregationFunctionType.accuracy: aggregate_accuracy,
|
AggregationFunctionType.accuracy: aggregate_accuracy,
|
||||||
AggregationFunctionType.average: aggregate_average,
|
AggregationFunctionType.average: aggregate_average,
|
||||||
|
AggregationFunctionType.weighted_average: aggregate_weighted_average,
|
||||||
AggregationFunctionType.categorical_count: aggregate_categorical_count,
|
AggregationFunctionType.categorical_count: aggregate_categorical_count,
|
||||||
AggregationFunctionType.median: aggregate_median,
|
AggregationFunctionType.median: aggregate_median,
|
||||||
}
|
}
|
||||||
|
|
|
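A quick toy check of the new aggregation (numbers are made up, and the module path is assumed from the base_scoring_fn import used earlier): two rows with scores 1.0 (weight 2.0) and 0.0 (weight 1.0) aggregate to (1.0*2.0 + 0.0*1.0) / (2.0 + 1.0) ≈ 0.667, whereas the unweighted average would report 0.5.

```python
# Toy demonstration of aggregate_weighted_average; values are invented and the
# module path is an assumption based on the imports shown in this PR.
from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_weighted_average

rows = [
    {"score": 1.0, "weight": 2.0},  # an example with two instructions, both followed
    {"score": 0.0, "weight": 1.0},  # an example with one instruction, not followed
]
print(aggregate_weighted_average(rows))  # {'weighted_average': 0.666...}
```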
@@ -203,6 +203,13 @@ def get_distribution_template() -> DistributionTemplate:
                 uri="huggingface://datasets/llamastack/bfcl_v3?split=train",
             ),
         ),
+        DatasetInput(
+            dataset_id="ifeval",
+            purpose=DatasetPurpose.eval_messages_answer,
+            source=URIDataSource(
+                uri="huggingface://datasets/llamastack/IfEval?split=train",
+            ),
+        ),
         DatasetInput(
             dataset_id="docvqa",
             purpose=DatasetPurpose.eval_messages_answer,
@@ -238,6 +245,11 @@ def get_distribution_template() -> DistributionTemplate:
             dataset_id="bfcl",
             scoring_functions=["basic::bfcl"],
         ),
+        BenchmarkInput(
+            benchmark_id="meta-reference-ifeval",
+            dataset_id="ifeval",
+            scoring_functions=["basic::ifeval"],
+        ),
         BenchmarkInput(
             benchmark_id="meta-reference-docvqa",
             dataset_id="docvqa",
@@ -188,6 +188,12 @@ datasets:
     uri: huggingface://datasets/llamastack/bfcl_v3?split=train
   metadata: {}
   dataset_id: bfcl
+- purpose: eval/messages-answer
+  source:
+    type: uri
+    uri: huggingface://datasets/llamastack/IfEval?split=train
+  metadata: {}
+  dataset_id: ifeval
 - purpose: eval/messages-answer
   source:
     type: uri
@@ -221,6 +227,11 @@ benchmarks:
   - basic::bfcl
   metadata: {}
   benchmark_id: meta-reference-bfcl
+- dataset_id: ifeval
+  scoring_functions:
+  - basic::ifeval
+  metadata: {}
+  benchmark_id: meta-reference-ifeval
 - dataset_id: docvqa
   scoring_functions:
   - basic::docvqa
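On a stack that was not built from the open-benchmark template, roughly the same dataset and benchmark could be registered through the client before running the eval. This is a sketch only: the `register()` method names and argument shapes are assumptions, and the run.yaml entries above are what the template actually ships.

```python
# Hedged sketch of manual registration via the client API. Method names and
# argument shapes are assumptions; the open-benchmark template registers these
# automatically through the run.yaml entries above.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")  # assumed endpoint

client.datasets.register(
    dataset_id="ifeval",
    purpose="eval/messages-answer",
    source={"type": "uri", "uri": "huggingface://datasets/llamastack/IfEval?split=train"},
)

client.benchmarks.register(
    benchmark_id="meta-reference-ifeval",
    dataset_id="ifeval",
    scoring_functions=["basic::ifeval"],
)
```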