diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 0af46e1f0..475b26d0a 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -52,6 +52,7 @@ jobs: # always test against the latest version of the client uv pip install git+https://github.com/meta-llama/llama-stack-client-python.git@main uv pip install -e . + llama stack build --template ollama --image-type venv - name: Wait for Ollama to start run: | diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 33b497a33..da0de2820 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -7,10 +7,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -23,6 +25,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -41,10 +44,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "nltk", "numpy", @@ -56,6 +61,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -75,10 +81,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "fastapi", "fire", "fireworks-ai", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -91,6 +99,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -112,11 +121,13 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", "huggingface_hub", + "langdetect", "matplotlib", "nltk", "numpy", @@ -128,6 +139,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -147,10 +159,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "fastapi", "fire", "fireworks-ai", "httpx", + "langdetect", "litellm", "matplotlib", "mcp", @@ -164,6 +178,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -184,11 +199,13 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "fireworks-ai", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -201,6 +218,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -219,10 +237,12 @@ "blobfile", "chardet", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "litellm", "matplotlib", "nltk", @@ -235,6 +255,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -253,11 +274,13 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", "huggingface_hub", + "langdetect", "matplotlib", "mcp", "nltk", @@ -270,6 +293,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -288,11 +312,13 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", "huggingface_hub", + "langdetect", "matplotlib", "mcp", "nltk", @@ -305,6 +331,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -325,11 +352,13 @@ "chardet", "chromadb-client", "datasets", + "emoji", "fairscale", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "lm-format-enforcer", "matplotlib", "mcp", @@ -343,6 +372,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -365,12 +395,14 @@ "chardet", "chromadb-client", "datasets", + "emoji", "fairscale", "faiss-cpu", "fastapi", "fbgemm-gpu", "fire", "httpx", + "langdetect", "lm-format-enforcer", "matplotlib", "mcp", @@ -384,6 +416,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -403,10 +436,12 @@ "aiosqlite", "blobfile", "chardet", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "nltk", "numpy", @@ -418,6 +453,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -436,10 +472,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -453,6 +491,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -470,9 +509,11 @@ "chardet", "chromadb-client", "datasets", + "emoji", "fastapi", "fire", "httpx", + "langdetect", "litellm", "matplotlib", "mcp", @@ -486,6 +527,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -505,10 +547,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -521,6 +565,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -540,10 +585,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -556,6 +603,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -605,11 +653,13 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", "huggingface_hub", + "langdetect", "matplotlib", "mcp", "nltk", @@ -622,6 +672,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -641,10 +692,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -657,6 +710,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", @@ -677,10 +731,12 @@ "chardet", "chromadb-client", "datasets", + "emoji", "faiss-cpu", "fastapi", "fire", "httpx", + "langdetect", "matplotlib", "mcp", "nltk", @@ -693,6 +749,7 @@ "psycopg2-binary", "pymongo", "pypdf", + "pythainlp", "redis", "requests", "scikit-learn", diff --git a/docs/_static/llama-stack-spec.html b/docs/_static/llama-stack-spec.html index eb626fc44..3e3ca723f 100644 --- a/docs/_static/llama-stack-spec.html +++ b/docs/_static/llama-stack-spec.html @@ -6268,6 +6268,7 @@ "type": "string", "enum": [ "average", + "weighted_average", "median", "categorical_count", "accuracy" diff --git a/docs/_static/llama-stack-spec.yaml b/docs/_static/llama-stack-spec.yaml index fa6920381..6261e9987 100644 --- a/docs/_static/llama-stack-spec.yaml +++ b/docs/_static/llama-stack-spec.yaml @@ -4389,6 +4389,7 @@ components: type: string enum: - average + - weighted_average - median - categorical_count - accuracy diff --git a/llama_stack/apis/scoring_functions/scoring_functions.py b/llama_stack/apis/scoring_functions/scoring_functions.py index b02a7a0c4..57761c940 100644 --- a/llama_stack/apis/scoring_functions/scoring_functions.py +++ b/llama_stack/apis/scoring_functions/scoring_functions.py @@ -36,6 +36,7 @@ class ScoringFnParamsType(Enum): @json_schema_type class AggregationFunctionType(Enum): average = "average" + weighted_average = "weighted_average" median = "median" categorical_count = "categorical_count" accuracy = "accuracy" diff --git a/llama_stack/providers/inline/scoring/basic/scoring.py b/llama_stack/providers/inline/scoring/basic/scoring.py index 095d46cf5..9a45f7139 100644 --- a/llama_stack/providers/inline/scoring/basic/scoring.py +++ b/llama_stack/providers/inline/scoring/basic/scoring.py @@ -25,6 +25,7 @@ from .config import BasicScoringConfig from .scoring_fn.bfcl_scoring_fn import BFCLScoringFn from .scoring_fn.docvqa_scoring_fn import DocVQAScoringFn from .scoring_fn.equality_scoring_fn import EqualityScoringFn +from .scoring_fn.ifeval_scoring_fn import IfEvalScoringFn from .scoring_fn.regex_parser_math_response_scoring_fn import ( RegexParserMathResponseScoringFn, ) @@ -37,6 +38,7 @@ FIXED_FNS = [ RegexParserScoringFn, RegexParserMathResponseScoringFn, BFCLScoringFn, + IfEvalScoringFn, DocVQAScoringFn, ] diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py new file mode 100644 index 000000000..adca0791d --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from llama_stack.apis.common.type_system import NumberType +from llama_stack.apis.scoring_functions import ( + AggregationFunctionType, + BasicScoringFnParams, + ScoringFn, +) + +ifeval = ScoringFn( + identifier="basic::ifeval", + description="Eval intruction follow capacity by checkping how many instructions can be followed in each example", + return_type=NumberType(), + provider_id="basic", + provider_resource_id="ifeval", + params=BasicScoringFnParams( + aggregation_functions=[AggregationFunctionType.weighted_average], + ), +) diff --git a/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py b/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py new file mode 100644 index 000000000..f06333795 --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py @@ -0,0 +1,79 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import Any, Dict, Optional + +from llama_stack.apis.scoring import ScoringResultRow +from llama_stack.apis.scoring_functions import ScoringFnParams +from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn + +from ..utils.ifeval_utils import INSTRUCTION_DICT, INSTRUCTION_LIST +from .fn_defs.ifeval import ( + ifeval, +) + + +class IfEvalScoringFn(RegisteredBaseScoringFn): + """ + A scoring_fn Instruction-Following Eval (IFEval) benchmark + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.supported_fn_defs_registry = { + ifeval.identifier: ifeval, + } + + async def score_row( + self, + input_row: Dict[str, Any], + scoring_fn_identifier: Optional[str] = None, + scoring_params: Optional[ScoringFnParams] = None, + ) -> ScoringResultRow: + assert scoring_fn_identifier is not None, "Scoring function identifier not found." + fn_def = self.supported_fn_defs_registry[scoring_fn_identifier] + if scoring_params is not None: + fn_def.params = scoring_params + + instruction_list = input_row["instruction_id_list"] + generated_answer = input_row["generated_answer"].strip() + + is_following_list = [] + results = dict( + {k + "_correct": 0.0 for k in INSTRUCTION_LIST}, + **{k + "_total": 0.0 for k in INSTRUCTION_LIST}, + ) + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + results[instruction_id + "_total"] += 1.0 + results[instruction_id.split(":")[0] + "_total"] += 1.0 + + clean_input_row = {k: v for k, v in input_row["kwargs"][index].items() if v is not None} + print(clean_input_row) + instruction.build_description(**clean_input_row) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=input_row["prompt"]) + + if generated_answer and instruction.check_following(generated_answer): + is_following_list.append(True) + results[instruction_id + "_correct"] += 1.0 + results[instruction_id.split(":")[0] + "_correct"] += 1.0 + else: + is_following_list.append(False) + + if len(is_following_list) == 0: + return { + "score": 0.0, + "weight": 0.0, + } + + return { + "score": float(sum(is_following_list)) / float(len(is_following_list)), + "weight": float(len(is_following_list)), + } diff --git a/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py b/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py new file mode 100644 index 000000000..28605159f --- /dev/null +++ b/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py @@ -0,0 +1,3319 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import collections +import functools +import json +import logging +import random +import re +import string +from types import MappingProxyType +from typing import Dict, Iterable, List, Optional, Sequence, Union + +import emoji +import langdetect +import nltk +from pythainlp.tokenize import sent_tokenize as sent_tokenize_thai +from pythainlp.tokenize import word_tokenize as word_tokenize_thai + +logger = logging.getLogger() + +WORD_LIST = [ + "western", + "sentence", + "signal", + "dump", + "spot", + "opposite", + "bottom", + "potato", + "administration", + "working", + "welcome", + "morning", + "good", + "agency", + "primary", + "wish", + "responsibility", + "press", + "problem", + "president", + "steal", + "brush", + "read", + "type", + "beat", + "trainer", + "growth", + "lock", + "bone", + "case", + "equal", + "comfortable", + "region", + "replacement", + "performance", + "mate", + "walk", + "medicine", + "film", + "thing", + "rock", + "tap", + "total", + "competition", + "ease", + "south", + "establishment", + "gather", + "parking", + "world", + "plenty", + "breath", + "claim", + "alcohol", + "trade", + "dear", + "highlight", + "street", + "matter", + "decision", + "mess", + "agreement", + "studio", + "coach", + "assist", + "brain", + "wing", + "style", + "private", + "top", + "brown", + "leg", + "buy", + "procedure", + "method", + "speed", + "high", + "company", + "valuable", + "pie", + "analyst", + "session", + "pattern", + "district", + "pleasure", + "dinner", + "swimming", + "joke", + "order", + "plate", + "department", + "motor", + "cell", + "spend", + "cabinet", + "difference", + "power", + "examination", + "engine", + "horse", + "dimension", + "pay", + "toe", + "curve", + "literature", + "bother", + "fire", + "possibility", + "debate", + "activity", + "passage", + "hello", + "cycle", + "background", + "quiet", + "author", + "effect", + "actor", + "page", + "bicycle", + "error", + "throat", + "attack", + "character", + "phone", + "tea", + "increase", + "outcome", + "file", + "specific", + "inspector", + "internal", + "potential", + "staff", + "building", + "employer", + "shoe", + "hand", + "direction", + "garden", + "purchase", + "interview", + "study", + "recognition", + "member", + "spiritual", + "oven", + "sandwich", + "weird", + "passenger", + "particular", + "response", + "reaction", + "size", + "variation", + "a", + "cancel", + "candy", + "exit", + "guest", + "condition", + "fly", + "price", + "weakness", + "convert", + "hotel", + "great", + "mouth", + "mind", + "song", + "sugar", + "suspect", + "telephone", + "ear", + "roof", + "paint", + "refrigerator", + "organization", + "jury", + "reward", + "engineering", + "day", + "possession", + "crew", + "bar", + "road", + "description", + "celebration", + "score", + "mark", + "letter", + "shower", + "suggestion", + "sir", + "luck", + "national", + "progress", + "hall", + "stroke", + "theory", + "offer", + "story", + "tax", + "definition", + "history", + "ride", + "medium", + "opening", + "glass", + "elevator", + "stomach", + "question", + "ability", + "leading", + "village", + "computer", + "city", + "grand", + "confidence", + "candle", + "priest", + "recommendation", + "point", + "necessary", + "body", + "desk", + "secret", + "horror", + "noise", + "culture", + "warning", + "water", + "round", + "diet", + "flower", + "bus", + "tough", + "permission", + "week", + "prompt", + "connection", + "abuse", + "height", + "save", + "corner", + "border", + "stress", + "drive", + "stop", + "rip", + "meal", + "listen", + "confusion", + "girlfriend", + "living", + "relation", + "significance", + "plan", + "creative", + "atmosphere", + "blame", + "invite", + "housing", + "paper", + "drink", + "roll", + "silver", + "drunk", + "age", + "damage", + "smoke", + "environment", + "pack", + "savings", + "influence", + "tourist", + "rain", + "post", + "sign", + "grandmother", + "run", + "profit", + "push", + "clerk", + "final", + "wine", + "swim", + "pause", + "stuff", + "singer", + "funeral", + "average", + "source", + "scene", + "tradition", + "personal", + "snow", + "nobody", + "distance", + "sort", + "sensitive", + "animal", + "major", + "negotiation", + "click", + "mood", + "period", + "arrival", + "expression", + "holiday", + "repeat", + "dust", + "closet", + "gold", + "bad", + "sail", + "combination", + "clothes", + "emphasis", + "duty", + "black", + "step", + "school", + "jump", + "document", + "professional", + "lip", + "chemical", + "front", + "wake", + "while", + "inside", + "watch", + "row", + "subject", + "penalty", + "balance", + "possible", + "adult", + "aside", + "sample", + "appeal", + "wedding", + "depth", + "king", + "award", + "wife", + "blow", + "site", + "camp", + "music", + "safe", + "gift", + "fault", + "guess", + "act", + "shame", + "drama", + "capital", + "exam", + "stupid", + "record", + "sound", + "swing", + "novel", + "minimum", + "ratio", + "machine", + "shape", + "lead", + "operation", + "salary", + "cloud", + "affair", + "hit", + "chapter", + "stage", + "quantity", + "access", + "army", + "chain", + "traffic", + "kick", + "analysis", + "airport", + "time", + "vacation", + "philosophy", + "ball", + "chest", + "thanks", + "place", + "mountain", + "advertising", + "red", + "past", + "rent", + "return", + "tour", + "house", + "construction", + "net", + "native", + "war", + "figure", + "fee", + "spray", + "user", + "dirt", + "shot", + "task", + "stick", + "friend", + "software", + "promotion", + "interaction", + "surround", + "block", + "purpose", + "practice", + "conflict", + "routine", + "requirement", + "bonus", + "hole", + "state", + "junior", + "sweet", + "catch", + "tear", + "fold", + "wall", + "editor", + "life", + "position", + "pound", + "respect", + "bathroom", + "coat", + "script", + "job", + "teach", + "birth", + "view", + "resolve", + "theme", + "employee", + "doubt", + "market", + "education", + "serve", + "recover", + "tone", + "harm", + "miss", + "union", + "understanding", + "cow", + "river", + "association", + "concept", + "training", + "recipe", + "relationship", + "reserve", + "depression", + "proof", + "hair", + "revenue", + "independent", + "lift", + "assignment", + "temporary", + "amount", + "loss", + "edge", + "track", + "check", + "rope", + "estimate", + "pollution", + "stable", + "message", + "delivery", + "perspective", + "mirror", + "assistant", + "representative", + "witness", + "nature", + "judge", + "fruit", + "tip", + "devil", + "town", + "emergency", + "upper", + "drop", + "stay", + "human", + "neck", + "speaker", + "network", + "sing", + "resist", + "league", + "trip", + "signature", + "lawyer", + "importance", + "gas", + "choice", + "engineer", + "success", + "part", + "external", + "worker", + "simple", + "quarter", + "student", + "heart", + "pass", + "spite", + "shift", + "rough", + "lady", + "grass", + "community", + "garage", + "youth", + "standard", + "skirt", + "promise", + "blind", + "television", + "disease", + "commission", + "positive", + "energy", + "calm", + "presence", + "tune", + "basis", + "preference", + "head", + "common", + "cut", + "somewhere", + "presentation", + "current", + "thought", + "revolution", + "effort", + "master", + "implement", + "republic", + "floor", + "principle", + "stranger", + "shoulder", + "grade", + "button", + "tennis", + "police", + "collection", + "account", + "register", + "glove", + "divide", + "professor", + "chair", + "priority", + "combine", + "peace", + "extension", + "maybe", + "evening", + "frame", + "sister", + "wave", + "code", + "application", + "mouse", + "match", + "counter", + "bottle", + "half", + "cheek", + "resolution", + "back", + "knowledge", + "make", + "discussion", + "screw", + "length", + "accident", + "battle", + "dress", + "knee", + "log", + "package", + "it", + "turn", + "hearing", + "newspaper", + "layer", + "wealth", + "profile", + "imagination", + "answer", + "weekend", + "teacher", + "appearance", + "meet", + "bike", + "rise", + "belt", + "crash", + "bowl", + "equivalent", + "support", + "image", + "poem", + "risk", + "excitement", + "remote", + "secretary", + "public", + "produce", + "plane", + "display", + "money", + "sand", + "situation", + "punch", + "customer", + "title", + "shake", + "mortgage", + "option", + "number", + "pop", + "window", + "extent", + "nothing", + "experience", + "opinion", + "departure", + "dance", + "indication", + "boy", + "material", + "band", + "leader", + "sun", + "beautiful", + "muscle", + "farmer", + "variety", + "fat", + "handle", + "director", + "opportunity", + "calendar", + "outside", + "pace", + "bath", + "fish", + "consequence", + "put", + "owner", + "go", + "doctor", + "information", + "share", + "hurt", + "protection", + "career", + "finance", + "force", + "golf", + "garbage", + "aspect", + "kid", + "food", + "boot", + "milk", + "respond", + "objective", + "reality", + "raw", + "ring", + "mall", + "one", + "impact", + "area", + "news", + "international", + "series", + "impress", + "mother", + "shelter", + "strike", + "loan", + "month", + "seat", + "anything", + "entertainment", + "familiar", + "clue", + "year", + "glad", + "supermarket", + "natural", + "god", + "cost", + "conversation", + "tie", + "ruin", + "comfort", + "earth", + "storm", + "percentage", + "assistance", + "budget", + "strength", + "beginning", + "sleep", + "other", + "young", + "unit", + "fill", + "store", + "desire", + "hide", + "value", + "cup", + "maintenance", + "nurse", + "function", + "tower", + "role", + "class", + "camera", + "database", + "panic", + "nation", + "basket", + "ice", + "art", + "spirit", + "chart", + "exchange", + "feedback", + "statement", + "reputation", + "search", + "hunt", + "exercise", + "nasty", + "notice", + "male", + "yard", + "annual", + "collar", + "date", + "platform", + "plant", + "fortune", + "passion", + "friendship", + "spread", + "cancer", + "ticket", + "attitude", + "island", + "active", + "object", + "service", + "buyer", + "bite", + "card", + "face", + "steak", + "proposal", + "patient", + "heat", + "rule", + "resident", + "broad", + "politics", + "west", + "knife", + "expert", + "girl", + "design", + "salt", + "baseball", + "grab", + "inspection", + "cousin", + "couple", + "magazine", + "cook", + "dependent", + "security", + "chicken", + "version", + "currency", + "ladder", + "scheme", + "kitchen", + "employment", + "local", + "attention", + "manager", + "fact", + "cover", + "sad", + "guard", + "relative", + "county", + "rate", + "lunch", + "program", + "initiative", + "gear", + "bridge", + "breast", + "talk", + "dish", + "guarantee", + "beer", + "vehicle", + "reception", + "woman", + "substance", + "copy", + "lecture", + "advantage", + "park", + "cold", + "death", + "mix", + "hold", + "scale", + "tomorrow", + "blood", + "request", + "green", + "cookie", + "church", + "strip", + "forever", + "beyond", + "debt", + "tackle", + "wash", + "following", + "feel", + "maximum", + "sector", + "sea", + "property", + "economics", + "menu", + "bench", + "try", + "language", + "start", + "call", + "solid", + "address", + "income", + "foot", + "senior", + "honey", + "few", + "mixture", + "cash", + "grocery", + "link", + "map", + "form", + "factor", + "pot", + "model", + "writer", + "farm", + "winter", + "skill", + "anywhere", + "birthday", + "policy", + "release", + "husband", + "lab", + "hurry", + "mail", + "equipment", + "sink", + "pair", + "driver", + "consideration", + "leather", + "skin", + "blue", + "boat", + "sale", + "brick", + "two", + "feed", + "square", + "dot", + "rush", + "dream", + "location", + "afternoon", + "manufacturer", + "control", + "occasion", + "trouble", + "introduction", + "advice", + "bet", + "eat", + "kill", + "category", + "manner", + "office", + "estate", + "pride", + "awareness", + "slip", + "crack", + "client", + "nail", + "shoot", + "membership", + "soft", + "anybody", + "web", + "official", + "individual", + "pizza", + "interest", + "bag", + "spell", + "profession", + "queen", + "deal", + "resource", + "ship", + "guy", + "chocolate", + "joint", + "formal", + "upstairs", + "car", + "resort", + "abroad", + "dealer", + "associate", + "finger", + "surgery", + "comment", + "team", + "detail", + "crazy", + "path", + "tale", + "initial", + "arm", + "radio", + "demand", + "single", + "draw", + "yellow", + "contest", + "piece", + "quote", + "pull", + "commercial", + "shirt", + "contribution", + "cream", + "channel", + "suit", + "discipline", + "instruction", + "concert", + "speech", + "low", + "effective", + "hang", + "scratch", + "industry", + "breakfast", + "lay", + "join", + "metal", + "bedroom", + "minute", + "product", + "rest", + "temperature", + "many", + "give", + "argument", + "print", + "purple", + "laugh", + "health", + "credit", + "investment", + "sell", + "setting", + "lesson", + "egg", + "middle", + "marriage", + "level", + "evidence", + "phrase", + "love", + "self", + "benefit", + "guidance", + "affect", + "you", + "dad", + "anxiety", + "special", + "boyfriend", + "test", + "blank", + "payment", + "soup", + "obligation", + "reply", + "smile", + "deep", + "complaint", + "addition", + "review", + "box", + "towel", + "minor", + "fun", + "soil", + "issue", + "cigarette", + "internet", + "gain", + "tell", + "entry", + "spare", + "incident", + "family", + "refuse", + "branch", + "can", + "pen", + "grandfather", + "constant", + "tank", + "uncle", + "climate", + "ground", + "volume", + "communication", + "kind", + "poet", + "child", + "screen", + "mine", + "quit", + "gene", + "lack", + "charity", + "memory", + "tooth", + "fear", + "mention", + "marketing", + "reveal", + "reason", + "court", + "season", + "freedom", + "land", + "sport", + "audience", + "classroom", + "law", + "hook", + "win", + "carry", + "eye", + "smell", + "distribution", + "research", + "country", + "dare", + "hope", + "whereas", + "stretch", + "library", + "if", + "delay", + "college", + "plastic", + "book", + "present", + "use", + "worry", + "champion", + "goal", + "economy", + "march", + "election", + "reflection", + "midnight", + "slide", + "inflation", + "action", + "challenge", + "guitar", + "coast", + "apple", + "campaign", + "field", + "jacket", + "sense", + "way", + "visual", + "remove", + "weather", + "trash", + "cable", + "regret", + "buddy", + "beach", + "historian", + "courage", + "sympathy", + "truck", + "tension", + "permit", + "nose", + "bed", + "son", + "person", + "base", + "meat", + "usual", + "air", + "meeting", + "worth", + "game", + "independence", + "physical", + "brief", + "play", + "raise", + "board", + "she", + "key", + "writing", + "pick", + "command", + "party", + "yesterday", + "spring", + "candidate", + "physics", + "university", + "concern", + "development", + "change", + "string", + "target", + "instance", + "room", + "bitter", + "bird", + "football", + "normal", + "split", + "impression", + "wood", + "long", + "meaning", + "stock", + "cap", + "leadership", + "media", + "ambition", + "fishing", + "essay", + "salad", + "repair", + "today", + "designer", + "night", + "bank", + "drawing", + "inevitable", + "phase", + "vast", + "chip", + "anger", + "switch", + "cry", + "twist", + "personality", + "attempt", + "storage", + "being", + "preparation", + "bat", + "selection", + "white", + "technology", + "contract", + "side", + "section", + "station", + "till", + "structure", + "tongue", + "taste", + "truth", + "difficulty", + "group", + "limit", + "main", + "move", + "feeling", + "light", + "example", + "mission", + "might", + "wait", + "wheel", + "shop", + "host", + "classic", + "alternative", + "cause", + "agent", + "consist", + "table", + "airline", + "text", + "pool", + "craft", + "range", + "fuel", + "tool", + "partner", + "load", + "entrance", + "deposit", + "hate", + "article", + "video", + "summer", + "feature", + "extreme", + "mobile", + "hospital", + "flight", + "fall", + "pension", + "piano", + "fail", + "result", + "rub", + "gap", + "system", + "report", + "suck", + "ordinary", + "wind", + "nerve", + "ask", + "shine", + "note", + "line", + "mom", + "perception", + "brother", + "reference", + "bend", + "charge", + "treat", + "trick", + "term", + "homework", + "bake", + "bid", + "status", + "project", + "strategy", + "orange", + "let", + "enthusiasm", + "parent", + "concentrate", + "device", + "travel", + "poetry", + "business", + "society", + "kiss", + "end", + "vegetable", + "employ", + "schedule", + "hour", + "brave", + "focus", + "process", + "movie", + "illegal", + "general", + "coffee", + "ad", + "highway", + "chemistry", + "psychology", + "hire", + "bell", + "conference", + "relief", + "show", + "neat", + "funny", + "weight", + "quality", + "club", + "daughter", + "zone", + "touch", + "tonight", + "shock", + "burn", + "excuse", + "name", + "survey", + "landscape", + "advance", + "satisfaction", + "bread", + "disaster", + "item", + "hat", + "prior", + "shopping", + "visit", + "east", + "photo", + "home", + "idea", + "father", + "comparison", + "cat", + "pipe", + "winner", + "count", + "lake", + "fight", + "prize", + "foundation", + "dog", + "keep", + "ideal", + "fan", + "struggle", + "peak", + "safety", + "solution", + "hell", + "conclusion", + "population", + "strain", + "alarm", + "measurement", + "second", + "train", + "race", + "due", + "insurance", + "boss", + "tree", + "monitor", + "sick", + "course", + "drag", + "appointment", + "slice", + "still", + "care", + "patience", + "rich", + "escape", + "emotion", + "royal", + "female", + "childhood", + "government", + "picture", + "will", + "sock", + "big", + "gate", + "oil", + "cross", + "pin", + "improvement", + "championship", + "silly", + "help", + "sky", + "pitch", + "man", + "diamond", + "most", + "transition", + "work", + "science", + "committee", + "moment", + "fix", + "teaching", + "dig", + "specialist", + "complex", + "guide", + "people", + "dead", + "voice", + "original", + "break", + "topic", + "data", + "degree", + "reading", + "recording", + "bunch", + "reach", + "judgment", + "lie", + "regular", + "set", + "painting", + "mode", + "list", + "player", + "bear", + "north", + "wonder", + "carpet", + "heavy", + "officer", + "negative", + "clock", + "unique", + "baby", + "pain", + "assumption", + "disk", + "iron", + "bill", + "drawer", + "look", + "double", + "mistake", + "finish", + "future", + "brilliant", + "contact", + "math", + "rice", + "leave", + "restaurant", + "discount", + "sex", + "virus", + "bit", + "trust", + "event", + "wear", + "juice", + "failure", + "bug", + "context", + "mud", + "whole", + "wrap", + "intention", + "draft", + "pressure", + "cake", + "dark", + "explanation", + "space", + "angle", + "word", + "efficiency", + "management", + "habit", + "star", + "chance", + "finding", + "transportation", + "stand", + "criticism", + "flow", + "door", + "injury", + "insect", + "surprise", + "apartment", +] # pylint: disable=line-too-long + +# ISO 639-1 codes to language names. +LANGUAGE_CODES = MappingProxyType( + { + "en": "English", + "es": "Spanish", + "pt": "Portuguese", + "ar": "Arabic", + "hi": "Hindi", + "fr": "French", + "ru": "Russian", + "de": "German", + "ja": "Japanese", + "it": "Italian", + "bn": "Bengali", + "uk": "Ukrainian", + "th": "Thai", + "ur": "Urdu", + "ta": "Tamil", + "te": "Telugu", + "bg": "Bulgarian", + "ko": "Korean", + "pl": "Polish", + "he": "Hebrew", + "fa": "Persian", + "vi": "Vietnamese", + "ne": "Nepali", + "sw": "Swahili", + "kn": "Kannada", + "mr": "Marathi", + "gu": "Gujarati", + "pa": "Punjabi", + "ml": "Malayalam", + "fi": "Finnish", + } +) + +# Chinese characters +_CHINESE_CHARS_PATTERN = r"[\u4E00-\u9FFF\u3400-\u4DBF]" +# Japanese Hiragana & Katakana +_JAPANESE_CHARS_PATTERN = r"[\u3040-\u309f\u30a0-\u30ff]" +# Korean (Hangul Syllables) +_KOREAN_CHARS_PATTERN = r"[\uAC00-\uD7AF]" +_ALPHABETS = "([A-Za-z])" +_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]" +_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)" +_STARTERS = ( + r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)" +) +_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)" +_WEBSITES = "[.](com|net|org|io|gov|edu|me)" +_DIGITS = "([0-9])" +_MULTIPLE_DOTS = r"\.{2,}" + + +# Util functions +def split_into_sentences(text): + """Split the text into sentences. + + Args: + text: A string that consists of more than or equal to one sentences. + + Returns: + A list of strings where each string is a sentence. + """ + text = " " + text + " " + text = text.replace("\n", " ") + text = re.sub(_PREFIXES, "\\1", text) + text = re.sub(_WEBSITES, "\\1", text) + text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1\\2", text) + text = re.sub( + _MULTIPLE_DOTS, + lambda match: "" * len(match.group(0)) + "", + text, + ) + if "Ph.D" in text: + text = text.replace("Ph.D.", "PhD") + text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1 ", text) + text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1 \\2", text) + text = re.sub( + _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]", + "\\1\\2\\3", + text, + ) + text = re.sub(_ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1\\2", text) + text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1 \\2", text) + text = re.sub(" " + _SUFFIXES + "[.]", " \\1", text) + text = re.sub(" " + _ALPHABETS + "[.]", " \\1", text) + if "”" in text: + text = text.replace(".”", "”.") + if '"' in text: + text = text.replace('."', '".') + if "!" in text: + text = text.replace('!"', '"!') + if "?" in text: + text = text.replace('?"', '"?') + text = text.replace(".", ".") + text = text.replace("?", "?") + text = text.replace("!", "!") + text = text.replace("", ".") + sentences = text.split("") + sentences = [s.strip() for s in sentences] + if sentences and not sentences[-1]: + sentences = sentences[:-1] + return sentences + + +def count_words(text): + """Counts the number of words.""" + tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+") + tokens = tokenizer.tokenize(text) + num_words = len(tokens) + return num_words + + +def split_chinese_japanese_hindi(lines: str) -> Iterable[str]: + """ + Split Chinese and Japanese text into sentences. + From https://stackoverflow.com/questions/27441191/splitting-chinese-document-into-sentences + Special question/exclamation marks were added upon inspection of our raw data, + Also supports multiple lines. + The separator for hindi is '।' + """ + for line in lines.splitlines(): + for sent in re.findall( + r"[^!?。\.\!\?\!\?\.\n।]+[!?。\.\!\?\!\?\.\n।]?", + line.strip(), + flags=re.U, + ): + yield sent + + +def count_words_cjk(text: str) -> int: + """Counts the number of words for Chinese and Japanese and Korean. + Can be extended to additional languages. + Source: https://stackoverflow.com/questions/49164507/how-to-count-the-number-of-chinese-korean-and-english-words withadditional modifications + Example: + >In: count_words_cjk('こんにちは、ジェイソンさん、Jason? Nice to meet you☺ ❤') + >Out: 19 + """ + # Non alpha numeric patterns in latin and asian languages. + non_alphanumeric_patterns = ( + r"[\\.\!\?\.\/_,\{\}<>:;$%^&*(+\"\'+——!,。?、`~@#¥……():;《)《》“”()\[\]«»〔〕\-「」]+" + ) + text = re.sub(non_alphanumeric_patterns, "", text) + + emoji_cnt = emoji.emoji_count(text) # count emojis + text = emoji.replace_emoji(text, "") # remove emojis + + foreign_chars_patterns = "|".join([_CHINESE_CHARS_PATTERN, _JAPANESE_CHARS_PATTERN, _KOREAN_CHARS_PATTERN]) + asian_chars = re.findall(foreign_chars_patterns, text) + asian_chars_cnt = len(asian_chars) + non_asian_chars = re.sub(foreign_chars_patterns, " ", text) + non_asian_words_cnt = len(non_asian_chars.split()) + + return non_asian_words_cnt + asian_chars_cnt + emoji_cnt + + +@functools.lru_cache(maxsize=None) +def _get_sentence_tokenizer(): + return nltk.data.load("nltk:tokenizers/punkt/english.pickle") + + +def count_sentences(text): + """Count the number of sentences.""" + tokenizer = _get_sentence_tokenizer() + tokenized_sentences = tokenizer.tokenize(text) + return len(tokenized_sentences) + + +def get_langid(text: str, lid_path: Optional[str] = None) -> str: + line_langs: List[str] = [] + lines = [line.strip() for line in text.split("\n") if len(line.strip()) >= 4] + + for line in lines: + try: + line_langs.append(langdetect.detect(line)) + except langdetect.LangDetectException as e: + logger.info("Unable to detect language for text %s due to %s", line, e) # refex: disable=pytotw.037 + + if len(line_langs) == 0: + return "en" + # select the text language to be the most commonly predicted language of the lines. + return collections.Counter(line_langs).most_common(1)[0][0] + + +def generate_keywords(num_keywords): + """Randomly generates a few keywords.""" + return random.sample(WORD_LIST, k=num_keywords) + + +"""Library of instructions""" +_InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]] + +_LANGUAGES = LANGUAGE_CODES + +# The relational operation for comparison. +_COMPARISON_RELATION = ("less than", "at least") + +# The maximum number of sentences. +_MAX_NUM_SENTENCES = 20 + +# The number of placeholders. +_NUM_PLACEHOLDERS = 4 + +# The number of bullet lists. +_NUM_BULLETS = 5 + +# The options of constrained response. +_CONSTRAINED_RESPONSE_OPTIONS = ( + "My answer is yes.", + "My answer is no.", + "My answer is maybe.", +) + +# The options of starter keywords. +_STARTER_OPTIONS = ( + "I would say", + "My answer is", + "I believe", + "In my opinion", + "I think", + "I reckon", + "I feel", + "From my perspective", + "As I see it", + "According to me", + "As far as I'm concerned", + "To my understanding", + "In my view", + "My take on it is", + "As per my perception", +) + +# The options of ending keywords. +# TODO(jeffreyzhou) add more ending options +_ENDING_OPTIONS = ("Any other questions?", "Is there anything else I can help with?") + +# The number of highlighted sections. +_NUM_HIGHLIGHTED_SECTIONS = 4 + +# The section spliter. +_SECTION_SPLITER = ("Section", "SECTION") + +# The number of sections. +_NUM_SECTIONS = 5 + +# The number of paragraphs. +_NUM_PARAGRAPHS = 5 + +# The postscript marker. +_POSTSCRIPT_MARKER = ("P.S.", "P.P.S") + +# The number of keywords. +_NUM_KEYWORDS = 2 + +# The occurrences of a single keyword. +_KEYWORD_FREQUENCY = 3 + +# The occurrences of a single letter. +_LETTER_FREQUENCY = 10 + +# The occurrences of words with all capital letters. +_ALL_CAPITAL_WORD_FREQUENCY = 20 + +# The number of words in the response. +_NUM_WORDS_LOWER_LIMIT = 100 +_NUM_WORDS_UPPER_LIMIT = 500 + + +class Instruction: + """An instruction template.""" + + def __init__(self, instruction_id): + self.id = instruction_id + + def build_description(self, **kwargs): + raise NotImplementedError("`build_description` not implemented.") + + def get_instruction_args(self): + raise NotImplementedError("`get_instruction_args` not implemented.") + + def get_instruction_args_keys(self): + raise NotImplementedError("`get_instruction_args_keys` not implemented.") + + def check_following(self, value): + raise NotImplementedError("`check_following` not implemented.") + + +class ResponseLanguageChecker(Instruction): + """Check the language of the entire response.""" + + def build_description(self, *, language=None): + """Build the instruction description. + + Args: + language: A string representing the expected language of the response. The + language has to comply to the 97 types defined in + `langid.py` (https://pypi.org/project/langid/1.1.5/), which follows + ISO 639-1 codes (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes); + for example, `en` for English, `zh` for Chinese, `fr` for French. + + Returns: + A string representing the instruction description. + """ + self._language = language + if self._language is None: + self._language = random.choice(list(_LANGUAGES.keys())) + + self._description_pattern = ( + "Your ENTIRE response should be in {language} language, no other " + "language is allowed." + ) + return self._description_pattern.format(language=_LANGUAGES[self._language]) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"language": self._language} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["language"] + + def check_following(self, value): + """Check if the language of the entire response follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the language of `value` follows instruction; otherwise False. + """ + assert isinstance(value, str) + + try: + return langdetect.detect(value) == self._language + except langdetect.LangDetectException as e: + # Count as instruction is followed. + logger.info("Unable to detect language for text %s due to %s", value, e) # refex: disable=pytotw.037 + return True + + +class NumberOfSentences(Instruction): + """Check the number of sentences.""" + + def build_description(self, *, num_sentences=None, relation=None): + """Build the instruction description. + + Args: + num_sentences: An integer specifying the number of sentences as a + threshold. + relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of sentences < the threshold; + if 'at least', the actual number of sentences >= the threshold. + + Returns: + A string representing the instruction description. + """ + # The number of sentences as a threshold for comparison. + self._num_sentences_threshold = num_sentences + if self._num_sentences_threshold is None or self._num_sentences_threshold < 0: + self._num_sentences_threshold = random.randint(1, _MAX_NUM_SENTENCES) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = "Your response should contain {relation} {num_sentences} sentences." + return self._description_pattern.format( + relation=self._comparison_relation, + num_sentences=self._num_sentences_threshold, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_sentences": self._num_sentences_threshold, + "relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_sentences", "relation"] + + def check_following(self, value): + """Check if the number of sentences follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the response follows the instruction. + + Raise: + ValueError if the string in `instruction_args` is not in + [`less_than`, `at_least`]. + """ + lang = get_langid(value) + if lang == "th": + # Counting Newline also as a new sentence: + num_sentences = sum([len(sent_tokenize_thai(line)) for line in value.splitlines()]) + elif lang in ["zh", "zh-cn", "zh-tw", "ja", "hi"]: + num_sentences = len(list(split_chinese_japanese_hindi(value))) + else: + num_sentences = count_sentences(value) + if self._comparison_relation == _COMPARISON_RELATION[0]: + return num_sentences < self._num_sentences_threshold + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return num_sentences >= self._num_sentences_threshold + + +class PlaceholderChecker(Instruction): + """Check the placeholders in template writing.""" + + def build_description(self, *, num_placeholders=None): + """Build the instruction description. + + Args: + num_placeholders: An integer denoting the minimum number of + placeholders required in the response. + + Returns: + A string representing the instruction description. + """ + self._num_placeholders = num_placeholders + if self._num_placeholders is None or self._num_placeholders < 0: + self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS) + self._description_pattern = ( + "The response must contain at least {num_placeholders} placeholders " + + "represented by square brackets, such as [address]." + ) + return self._description_pattern.format(num_placeholders=self._num_placeholders) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_placeholders": self._num_placeholders} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_placeholders"] + + def check_following(self, value): + """Check if the number of placeholders follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the actual number of placeholders in the response is greater than + or equal to `num_placeholders`; otherwise, False. + """ + placeholders = re.findall(r"\[.*?\]", value) + num_placeholders = len(placeholders) + return num_placeholders >= self._num_placeholders + + +class BulletListChecker(Instruction): + """Checks the bullet list in the prompt.""" + + def build_description(self, *, num_bullets=None): + """Build the instruction description. + + Args: + num_bullets: An integer specifying the exact number of bullet lists + that is required to appear in the response. + + Returns: + A string representing the instruction description. + """ + self._num_bullets = num_bullets + if self._num_bullets is None or self._num_bullets < 0: + self._num_bullets = random.randint(1, _NUM_BULLETS) + self._description_pattern = ( + "Your answer must contain exactly {num_bullets} bullet points. " + + "Use the markdown bullet points such as:\n" + + "* This is point 1. \n" + + "* This is point 2" + ) + return self._description_pattern.format(num_bullets=self._num_bullets) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_bullets": self._num_bullets} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_bullets"] + + def check_following(self, value): + r"""Check if the number of bullet lists meets the requirement. + + Args: + value: A string representing the response. The response is expected to + contain some bullet lists that start with `\*`. + + Returns: + True if the actual number of bullet lists in the response meets the + requirement. + """ + bullet_lists = re.findall(r"^\s*\*[^\*].*$", value, flags=re.MULTILINE) + bullet_lists_2 = re.findall(r"^\s*-.*$", value, flags=re.MULTILINE) + num_bullet_lists = len(bullet_lists) + len(bullet_lists_2) + return num_bullet_lists == self._num_bullets + + +class ConstrainedResponseChecker(Instruction): + """Checks the constrained response.""" + + def build_description(self): + """Build the instruction description.""" + # A sequence of string(s) representing the options of the expected response. + self._constrained_responses = _CONSTRAINED_RESPONSE_OPTIONS + self._description_pattern = "Answer with one of the following options: {response_options}" + return self._description_pattern.format(response_options=self._constrained_responses) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response matches the constrained options. + + Args: + value: A string representing the response. + + Returns: + True if the actual response contains one of the options in the constrained + responses; otherwise False. + """ + value = value.strip() + for constrained_response in self._constrained_responses: + if constrained_response in value: + return True + return False + + +class ConstrainedStartChecker(Instruction): + """Checks the response start.""" + + def build_description(self, *, starter=None): + """Build the instruction description. + + Args: + starter: A string representing the keyward that the response should start + with. + + Returns: + A string representing the instruction description. + """ + self._starter = starter.strip() if isinstance(starter, str) else starter + if self._starter is None: + self._starter = random.choice(_STARTER_OPTIONS) + self._description_pattern = ( + "During the conversation, when it is your turn, " + "please always start with {starter}" + ) + return self._description_pattern.format(starter=self._starter) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"starter": self._starter} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["starter"] + + def check_following(self, value): + """Checks if the response starts with the constrained keyword or phrase. + + Args: + value: A string representing the response. + + Returns: + True if the response starts with the given phrase or keyword that is + contained in `instruction_args`; otherwise, False. + """ + response_pattern = r"^\s*" + self._starter + r".*$" + response_with_constrained_start = re.search(response_pattern, value, flags=re.MULTILINE) + return True if response_with_constrained_start else False + + +class HighlightSectionChecker(Instruction): + """Checks the highlighted section.""" + + def build_description(self, *, num_highlights=None): + """Build the instruction description. + + Args: + num_highlights: An integer specifying the minimum number of highlighted + sections. + + Returns: + A string representing the instruction description. + """ + self._num_highlights = num_highlights + if self._num_highlights is None or self._num_highlights < 0: + self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS) + + self._description_pattern = ( + "Highlight at least {num_highlights} sections in your answer with " + + "markdown, i.e. *highlighted section*." + ) + + return self._description_pattern.format(num_highlights=self._num_highlights) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_highlights": self._num_highlights} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_highlights"] + + def check_following(self, value): + """Checks if the number of highlighted sections meets the requirement. + + Args: + value: a string repesenting the response. The response is expected to + contain highlighted sections in the format of *highlighted*. + + Returns: + True if the actual number of highlighted sections in the format of + *highlighed sections* meets the minimum requirement; otherwise False. + """ + num_highlights = 0 + highlights = re.findall(r"\*[^\n\*]*\*", value) + double_highlights = re.findall(r"\*\*[^\n\*]*\*\*", value) + for highlight in highlights: + if highlight.strip("*").strip(): + num_highlights += 1 + for highlight in double_highlights: + if highlight.removeprefix("**").removesuffix("**").strip(): + num_highlights += 1 + + return num_highlights >= self._num_highlights + + +class SectionChecker(Instruction): + """Checks the sections.""" + + def build_description(self, *, section_spliter=None, num_sections=None): + """Build the instruction description. + + Args: + section_spliter: A string represents the section spliter keyword that + marks a new section, i.e., `Section` or `SECTION`. + num_sections: An integer specifying the number of sections. + + Returns: + A string representing the instruction description. + """ + self._section_spliter = section_spliter.strip() if isinstance(section_spliter, str) else section_spliter + if self._section_spliter is None: + self._section_spliter = random.choice(_SECTION_SPLITER) + + self._num_sections = num_sections + if self._num_sections is None or self._num_sections < 0: + self._num_sections = random.randint(1, _NUM_SECTIONS) + + self._description_pattern = ( + "Your response must have {num_sections} sections. Mark the beginning " + + "of each section with {section_spliter} X, such as:\n" + + "{section_spliter} 1\n" + + "[content of section 1]\n" + + "{section_spliter} 2\n" + + "[content of section 2]" + ) + + return self._description_pattern.format(num_sections=self._num_sections, section_spliter=self._section_spliter) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "section_spliter": self._section_spliter, + "num_sections": self._num_sections, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["section_spliter", "num_sections"] + + def check_following(self, value): + """Checks the response contains multiple sections. + + Args: + value: A string representing the response. The response is expected + to contain multiple sections (number of sections is greater than 1). + A new section starts with `Section 1`, where the number denotes the + section index. + + Returns: + True if the number of sections in the response is greater than or equal to + the minimum number of sections; otherwise, False. + """ + section_splitter_patten = r"\s?" + self._section_spliter + r"\s?\d+\s?" + sections = re.split(section_splitter_patten, value) + num_sections = len(sections) - 1 + return num_sections >= self._num_sections + + +class ParagraphChecker(Instruction): + """Checks the paragraphs.""" + + def build_description(self, *, num_paragraphs=None): + """Build the instruction description. + + Args: + num_paragraphs: An integer specifying the number of paragraphs. + + Returns: + A string representing the instruction description. + """ + self._num_paragraphs = num_paragraphs + if self._num_paragraphs is None or self._num_paragraphs < 0: + self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) + + self._description_pattern = ( + "There should be {num_paragraphs} paragraphs. " + "Paragraphs are separated with the markdown divider: ***" + ) + + return self._description_pattern.format(num_paragraphs=self._num_paragraphs) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_paragraphs": self._num_paragraphs} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_paragraphs"] + + def check_following(self, value): + """Checks the response contains required number of paragraphs. + + Args: + value: A string representing the response. The response may contain + paragraphs that are separated by the markdown divider: `***`. + + Returns: + True if the actual number of paragraphs is the same as required; + otherwise, False. + """ + paragraphs = re.split(r"\s?\*\*\*\s?", value) + num_paragraphs = len(paragraphs) + + for index, paragraph in enumerate(paragraphs): + if not paragraph.strip(): + if index == 0 or index == len(paragraphs) - 1: + num_paragraphs -= 1 + else: + return False + + return num_paragraphs == self._num_paragraphs + + +class PostscriptChecker(Instruction): + """Checks the postscript.""" + + def build_description(self, *, postscript_marker=None): + """Build the instruction description. + + Args: + postscript_marker: A string containing the keyword that marks the start + of the postscript section. + + Returns: + A string representing the instruction description. + """ + self._postscript_marker = postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker + if self._postscript_marker is None: + self._postscript_marker = random.choice(_POSTSCRIPT_MARKER) + + self._description_pattern = ( + "At the end of your response, please explicitly add a postscript " + "starting with {postscript}" + ) + + return self._description_pattern.format(postscript=self._postscript_marker) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"postscript_marker": self._postscript_marker} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["postscript_marker"] + + def check_following(self, value): + """Checks if the response follows the postscript format. + + Args: + value: a string representing the response. The response is expected to + contain a postscript section. + + Returns: + True if the response contains a postscript section starting with + the keyword containing in the `instruction_args`; otherwise False. + """ + value = value.lower() + if self._postscript_marker == "P.P.S": + postscript_pattern = r"\s*p\.\s?p\.\s?s.*$" + elif self._postscript_marker == "P.S.": + postscript_pattern = r"\s*p\.\s?s\..*$" + else: + postscript_pattern = r"\s*" + self._postscript_marker.lower() + r".*$" + postscript = re.findall(postscript_pattern, value, flags=re.MULTILINE) + return True if postscript else False + + +class RephraseChecker(Instruction): + """Checks the repharse.""" + + def build_description(self, *, original_message): + """Build the instruction description. + + Args: + original_message: A string representing the original message. The + rephrased response should only change its words/sentences in between + its two asterisks, for example, *change me*. Both original and rephrased + messages should contain the changes in the form of *change me*. + + Returns: + A string representing the instruction description. + """ + if not self.is_change(original_message): + raise ValueError(f"Message {original_message} does not contain changes in the form of *change me*.") + + self._reference_without_change = original_message + self._description = ( + "Rephrasing: Your rephrased response should only" + + "change the words/sentences in between two asterisks" + + "such as *change me*." + ) + return self._description + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"original_message": self._reference_without_change} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["original_message"] + + def check_following(self, value): + r"""Checks if the rephrasing follows the instruction. + + Args: + value: A string representing the response, which is expected to rephras + the string of `instruction_args`. + + Returns: + True if `value` and `instruction_args` only differ by the words/sentences + in between two asterisks such as *change me*; otherwise, False. + """ + + if not self.is_change(value): + raise ValueError(f"value {value} does not contain changes in the form of *change me*.") + + response_without_changes = self.strip_changes(value) + reference_without_changes = self.strip_changes(self._reference_without_change) + + return response_without_changes == reference_without_changes + + def is_change(self, response): + """Check if there is change in the response in the form of *change me*.""" + return re.search(r"\*.*\*", response) + + def strip_changes(self, response): + """Strips off the changes.""" + return re.sub(r"\*.*\*", "", response) + + +class KeywordChecker(Instruction): + """Check the exisitence of certain keywords.""" + + def build_description(self, *, keywords=None): + """Build the instruction description. + + Args: + keywords: A sequence of strings representing the keywords that are + expected in the response. + + Returns: + A string representing the instruction description. + """ + + if not keywords: + self._keywords = generate_keywords(num_keywords=_NUM_KEYWORDS) + else: + self._keywords = keywords + self._keywords = sorted(self._keywords) + + self._description_pattern = "Include keywords {keywords} in the response." + + return self._description_pattern.format(keywords=self._keywords) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"keywords": self._keywords} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["keywords"] + + def check_following(self, value): + """Check if the response contain the expected keywords.""" + for keyword in self._keywords: + if not re.search(keyword, value, flags=re.IGNORECASE): + return False + return True + + +class KeywordFrequencyChecker(Instruction): + """Check the keyword frequency.""" + + def build_description(self, *, keyword=None, frequency=None, relation=None): + """Build the instruction description. + + Args: + keyword: A string representing a keyword that is expected in the response. + frequency: An integer specifying the number of times `keyword` is expected + to appear in the response. + relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of occurrences < frequency; + if 'at least', the actual number of occurrences >= frequency. + + Returns: + A string representing the instruction description. + """ + if not keyword: + self._keyword = generate_keywords(num_keywords=1)[0] + else: + self._keyword = keyword.strip() + + self._frequency = frequency + if self._frequency is None or self._frequency < 0: + self._frequency = random.randint(1, _KEYWORD_FREQUENCY) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = ( + "In your response, the word {keyword} should appear {relation} " + "{frequency} times." + ) + + return self._description_pattern.format( + keyword=self._keyword, + relation=self._comparison_relation, + frequency=self._frequency, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "keyword": self._keyword, + "frequency": self._frequency, + "relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["keyword", "frequency", "relation"] + + def check_following(self, value): + """Checks if the response contain the keyword with required frequency.""" + actual_occurrences = len(re.findall(self._keyword, value, flags=re.IGNORECASE)) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return actual_occurrences < self._frequency + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return actual_occurrences >= self._frequency + + +class NumberOfWords(Instruction): + """Checks the number of words.""" + + def build_description(self, *, num_words=None, relation=None): + """Build the instruction description. + + Args: + num_words: An integer specifying the number of words contained in the + response. + relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of words < num_words; + if 'at least', the actual number of words >= num_words. + + Returns: + A string representing the instruction description. + """ + + self._num_words = num_words + if self._num_words is None or self._num_words < 0: + self._num_words = random.randint(_NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = "Answer with {relation} {num_words} words." + + return self._description_pattern.format(relation=self._comparison_relation, num_words=self._num_words) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_words": self._num_words, "relation": self._comparison_relation} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_words", "relation"] + + def check_following(self, value): + """Checks if the response contains the expected number of words.""" + lang = get_langid(value) + if lang == "th": + num_words = len(word_tokenize_thai(value)) + elif lang in ["zh", "zh-cn", "zh-tw", "ja", "ko"]: + num_words = count_words_cjk(value) + else: + num_words = count_words(value) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return num_words < self._num_words + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return num_words >= self._num_words + + +class JsonFormat(Instruction): + """Check the Json format.""" + + def build_description(self): + self._description_pattern = ( + "Entire output should be wrapped in JSON format. You can use markdown ticks such as ```." + ) + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + value = ( + value.strip() + .removeprefix("```json") + .removeprefix("```Json") + .removeprefix("```JSON") + .removeprefix("```") + .removesuffix("```") + .strip() + ) + try: + json.loads(value) + except ValueError as _: + return False + return True + + +class ParagraphFirstWordCheck(Instruction): + """Check the paragraph and the first word of the nth paragraph.""" + + def build_description(self, num_paragraphs=None, nth_paragraph=None, first_word=None): + r"""Build the instruction description. + + Args: + num_paragraphs: An integer indicating the number of paragraphs expected + in the response. A paragraph is a subset of the string that is + expected to be separated by '\n\n'. + nth_paragraph: An integer indicating the paragraph number that we look at. + Note that n starts from 1. + first_word: A string that represent the first word of the bth paragraph. + + Returns: + A string representing the instruction description. + """ + self._num_paragraphs = num_paragraphs + if self._num_paragraphs is None or self._num_paragraphs < 0: + self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) + + self._nth_paragraph = nth_paragraph + if self._nth_paragraph is None or self._nth_paragraph <= 0 or self._nth_paragraph > self._num_paragraphs: + self._nth_paragraph = random.randint(1, self._num_paragraphs + 1) + + self._first_word = first_word + if self._first_word is None: + self._first_word = generate_keywords(num_keywords=1)[0] + self._first_word = self._first_word.lower() + + self._description_pattern = ( + "There should be {num_paragraphs} paragraphs. " + + "Paragraphs and only paragraphs are separated with each other by two " + + "new lines as if it was '\\n\\n' in python. " + + "Paragraph {nth_paragraph} must start with word {first_word}." + ) + + return self._description_pattern.format( + num_paragraphs=self._num_paragraphs, + nth_paragraph=self._nth_paragraph, + first_word=self._first_word, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_paragraphs": self._num_paragraphs, + "nth_paragraph": self._nth_paragraph, + "first_word": self._first_word, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_paragraphs", "nth_paragraph", "first_word"] + + def check_following(self, value): + """Checks for required number of paragraphs and correct first word. + + Args: + value: a string representing the response. The response may contain + paragraphs that are separated by two new lines and the first word of + the nth paragraph will have to match a specified word. + + Returns: + True if the number of paragraphs is the same as required and the first + word of the specified paragraph is the same as required. Otherwise, false. + """ + + paragraphs = re.split(r"\n\n", value) + num_paragraphs = len(paragraphs) + + for paragraph in paragraphs: + if not paragraph.strip(): + num_paragraphs -= 1 + + # check that index doesn't go out of bounds + if self._nth_paragraph <= num_paragraphs: + paragraph = paragraphs[self._nth_paragraph - 1].strip() + if not paragraph: + return False + else: + return False + + first_word = "" + punctuation = {".", ",", "?", "!", "'", '"'} + + # get first word and remove punctuation + word = paragraph.split()[0].strip() + word = word.lstrip("'") + word = word.lstrip('"') + + for letter in word: + if letter in punctuation: + break + first_word += letter.lower() + + return num_paragraphs == self._num_paragraphs and first_word == self._first_word + + +class KeySentenceChecker(Instruction): + """Check the existence of certain key sentences.""" + + def build_description(self, key_sentences=None, num_sentences=None): + """Build the instruction description. + + Args: + key_sentences: A sequences of strings representing the key sentences that + are expected in the response. + num_sentences: The number of key sentences that are expected to be seen in + the response. + + Returns: + A string representing the instruction description. + """ + + if not key_sentences: + self._key_sentences = {["For now, this is fine."]} + else: + self._key_sentences = key_sentences + + if not num_sentences: + self._num_sentences = random.randint(1, len(self._key_sentences)) + else: + self._num_sentences = num_sentences + + self._description_pattern = "Include {num_sentences} of the following sentences {key_sentences}" + + return self._description_pattern.format(num_sentences=self._num_sentences, key_sentences=self._key_sentences) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_sentences": self._num_sentences, + "key_sentences": list(self._key_sentences), + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_sentences", "key_sentences"] + + def check_following(self, value): + """Checks if the response contains the expected key sentences.""" + count = 0 + sentences = split_into_sentences(value) + for sentence in self._key_sentences: + if sentence in sentences: + count += 1 + + return count == self._num_sentences + + +class ForbiddenWords(Instruction): + """Checks that specified words are not used in response.""" + + def build_description(self, forbidden_words=None): + """Build the instruction description. + + Args: + forbidden_words: A sequences of strings respresenting words that are not + allowed in the response. + + Returns: + A string representing the instruction description. + """ + + if not forbidden_words: + self._forbidden_words = generate_keywords(num_keywords=_NUM_KEYWORDS) + else: + self._forbidden_words = list(set(forbidden_words)) + self._forbidden_words = sorted(self._forbidden_words) + self._description_pattern = "Do not include keywords {forbidden_words} in the response." + + return self._description_pattern.format(forbidden_words=self._forbidden_words) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"forbidden_words": self._forbidden_words} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["forbidden_words"] + + def check_following(self, value): + """Check if the response does not contain the expected keywords.""" + for word in self._forbidden_words: + if re.search(r"\b" + word + r"\b", value, flags=re.IGNORECASE): + return False + return True + + +class RephraseParagraph(Instruction): + """Checks that the paragraph is rephrased.""" + + def build_description(self, *, original_paragraph, low, high): + """Builds the instruction description. + + Args: + original_paragraph: A string presenting the original paragraph. The + rephrases response should have betweeb low-high words in common. + low: An integer presenting the lower bound of similar words. + high: An integer representing the upper bound of similar words. + + Returns: + A string representing the instruction description. + """ + self._original_paragraph = original_paragraph + self._low = low + self._high = high + + self._description = ( + "Rephrase the following paragraph: " + + "{original_paragraph}\nYour response should have " + + "between {low} and {high} of the same words. " + + "Words are the same if and only if all of the " + + "letters, ignoring cases, are the same. For " + + "example, 'run' is the same as 'Run' but different " + + "to 'ran'." + ) + + return self._description.format(original_paragraph=original_paragraph, low=self._low, high=self._high) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "original_paragraph": self._original_paragraph, + "low": self._low, + "high": self._high, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["original_paragraph", "low", "high"] + + def check_following(self, value): + val_words = re.findall(r"\w+", value.lower()) + original_words = re.findall(r"\w+", self._original_paragraph.lower()) + similar_words = 0 + + dict_val = collections.Counter(val_words) + dict_original = collections.Counter(original_words) + + for word in dict_original: + similar_words += min(dict_original[word], dict_val[word]) + + return similar_words >= self._low and similar_words <= self._high + + +class TwoResponsesChecker(Instruction): + """Check that two responses were given.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Give two different responses. Responses and only responses should" + " be separated by 6 asterisk symbols: ******." + ) + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response has two different answers. + + Args: + value: A string representing the response. + + Returns: + True if two responses are detected and false otherwise. + """ + valid_responses = list() + responses = value.split("******") + for index, response in enumerate(responses): + if not response.strip(): + if index != 0 and index != len(responses) - 1: + return False + else: + valid_responses.append(response) + return len(valid_responses) == 2 and valid_responses[0].strip() != valid_responses[1].strip() + + +class RepeatPromptThenAnswer(Instruction): + """Checks that Prompt is first repeated then answered.""" + + def build_description(self, *, prompt_to_repeat=None): + """Build the instruction description. + + Args: + prompt_to_repeat: The prompt that is meant to be repeated. + + Returns: + A string representing the instruction description. + """ + if not prompt_to_repeat: + raise ValueError("prompt_to_repeat must be set.") + else: + self._prompt_to_repeat = prompt_to_repeat + self._description_pattern = ( + "First repeat the request word for word without change," + " then give your answer (1. do not say any words or characters" + " before repeating the request; 2. the request you need to repeat" + " does not include this sentence)" + ) + return self._description_pattern + + def get_instruction_args(self): + return {"prompt_to_repeat": self._prompt_to_repeat} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["prompt_to_repeat"] + + def check_following(self, value): + if value.strip().lower().startswith(self._prompt_to_repeat.strip().lower()): + return True + return False + + +class EndChecker(Instruction): + """Checks that the prompt ends with a given phrase.""" + + def build_description(self, *, end_phrase=None): + """Build the instruction description. + + Args: + end_phrase: A string representing the phrase the response should end with. + + Returns: + A string representing the instruction description. + """ + self._end_phrase = end_phrase.strip() if isinstance(end_phrase, str) else end_phrase + if self._end_phrase is None: + self._end_phrase = random.choice(_ENDING_OPTIONS) + self._description_pattern = ( + "Finish your response with this exact phrase {ender}. No other words should follow this phrase." + ) + return self._description_pattern.format(ender=self._end_phrase) + + def get_instruction_args(self): + return {"end_phrase": self._end_phrase} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["end_phrase"] + + def check_following(self, value): + """Checks if the response ends with the expected phrase.""" + value = value.strip().strip('"').lower() + self._end_phrase = self._end_phrase.strip().lower() + return value.endswith(self._end_phrase) + + +class TitleChecker(Instruction): + """Checks the response for a title.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Your answer must contain a title, wrapped in double angular brackets, such as <>." + ) + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response contains a title.""" + pattern = r"<<[^\n]+>>" + re_pattern = re.compile(pattern) + titles = re.findall(re_pattern, value) + + for title in titles: + if title.lstrip("<").rstrip(">").strip(): + return True + return False + + +class LetterFrequencyChecker(Instruction): + """Checks letter frequency.""" + + def build_description(self, *, letter=None, let_frequency=None, let_relation=None): + """Build the instruction description. + + Args: + letter: A string representing a letter that is expected in the response. + let_frequency: An integer specifying the number of times `keyword` is + expected to appear in the response. + let_relation: A string in (`less than`, `at least`), defining the + relational operator for comparison. Two relational comparisons are + supported for now; if 'less than', the actual number of + occurrences < frequency; if 'at least', the actual number of + occurrences >= frequency. + + Returns: + A string representing the instruction description. + """ + if not letter or len(letter) > 1 or ord(letter.lower()) < 97 or ord(letter.lower()) > 122: + self._letter = random.choice(list(string.ascii_letters)) + else: + self._letter = letter.strip() + self._letter = self._letter.lower() + + self._frequency = let_frequency + if self._frequency is None or self._frequency < 0: + self._frequency = random.randint(1, _LETTER_FREQUENCY) + + if let_relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif let_relation not in _COMPARISON_RELATION: + raise ValueError( + f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {let_relation} is given." + ) + else: + self._comparison_relation = let_relation + + self._description_pattern = ( + "In your response, the letter {letter} should appear {let_relation} {let_frequency} times." + ) + + return self._description_pattern.format( + letter=self._letter, + let_frequency=self._frequency, + let_relation=self._comparison_relation, + ) + + def get_instruction_args(self): + """Returns the keyword args of build description.""" + return { + "letter": self._letter, + "let_frequency": self._frequency, + "let_relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["letter", "let_frequency", "let_relation"] + + def check_following(self, value): + """Checks that the response contains the letter at the right frequency.""" + value = value.lower() + letters = collections.Counter(value) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return letters[self._letter] < self._frequency + else: + return letters[self._letter] >= self._frequency + + +class CapitalLettersEnglishChecker(Instruction): + """Checks that the response is in english and is in all capital letters.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = "Your entire response should be in English, and in all capital letters." + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response is in English and in all capital letters.""" + assert isinstance(value, str) + + try: + return value.isupper() and langdetect.detect(value) == "en" + except langdetect.LangDetectException as e: + # Count as instruction is followed. + logger.info("Unable to detect language for text %s due to %s", value, e) # refex: disable=pytotw.037 + return True + + +class LowercaseLettersEnglishChecker(Instruction): + """Checks that the response is in english and is in all lowercase letters.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Your entire response should be in English, and in all lowercase letters. No capital letters are allowed." + ) + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response is in English and in all lowercase letters.""" + assert isinstance(value, str) + + try: + return value.islower() and langdetect.detect(value) == "en" + except langdetect.LangDetectException as e: + # Count as instruction is followed. + logger.info("Unable to detect language for text %s due to %s", value, e) # refex: disable=pytotw.037 + return True + + +class CommaChecker(Instruction): + """Checks the response for no commas.""" + + def build_description(self, **kwargs): + """Build the instruction description.""" + self._description_pattern = "In your entire response, refrain from the use of any commas." + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response does not contain commas.""" + return not re.search(r"\,", value) + + +class CapitalWordFrequencyChecker(Instruction): + """Checks frequency of words with all capital letters.""" + + def build_description( + self, + capital_frequency=None, + capital_relation=None, + ): + """Build the instruction description. + + Args: + capital_frequency: An integer that represents the number of words that + should be in all capital letters. + capital_relation: A string that is 'at least' or 'at most' that refers to + the frequency. + + Returns: + A string representing the instruction description. + """ + self._frequency = capital_frequency + if self._frequency is None: + self._frequency = random.randint(1, _ALL_CAPITAL_WORD_FREQUENCY) + + self._comparison_relation = capital_relation + if capital_relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif capital_relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {capital_relation} is given." + ) + + self._description_pattern = ( + "In your response, words with all capital letters should appear {relation} {frequency} times." + ) + + return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation) + + def get_instruction_args(self): + """Returns the keyword args of build description.""" + return { + "capital_frequency": self._frequency, + "capital_relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["capital_frequency", "capital_relation"] + + def check_following(self, value): + """Checks the frequency of words with all capital letters.""" + # Hyphenated words will count as one word + nltk.download("punkt_tab") + words = nltk.word_tokenize(value) + capital_words = [word for word in words if word.isupper()] + + capital_words = len(capital_words) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return capital_words < self._frequency + else: + return capital_words >= self._frequency + + +class QuotationChecker(Instruction): + """Checks response is wrapped with double quotation marks.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = "Wrap your entire response with double quotation marks." + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyword args of build description.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response is wrapped with double quotation marks.""" + quotations_map = { + "ja": "「」", + "ru": "«»", + "th": "“”", + "zh": "“”", + "zh-cn": "“”", + "zh-tw": "“”", + } + value = value.strip() + lang = get_langid(value) + quotes = quotations_map.get(lang, '""') + # TODO: We may wanna revisit this logic in new generations to only check of the response language's quotes. + return len(value) > 1 and value[0] in [quotes[0], '"'] and value[-1] in [quotes[1], '"'] + + +# Define instruction dicts +_KEYWORD = "keywords:" +_LANGUAGE = "language:" +_LENGTH = "length_constraints:" +_CONTENT = "detectable_content:" +_FORMAT = "detectable_format:" +_MULTITURN = "multi-turn:" +_COMBINATION = "combination:" +_STARTEND = "startend:" +_CHANGE_CASES = "change_case:" +_PUNCTUATION = "punctuation:" + +INSTRUCTION_DICT = { + _KEYWORD + "existence": KeywordChecker, + _KEYWORD + "frequency": KeywordFrequencyChecker, + # _KEYWORD + "key_sentences": KeySentenceChecker, + _KEYWORD + "forbidden_words": ForbiddenWords, + _KEYWORD + "letter_frequency": LetterFrequencyChecker, + _LANGUAGE + "response_language": ResponseLanguageChecker, + _LENGTH + "number_sentences": NumberOfSentences, + _LENGTH + "number_paragraphs": ParagraphChecker, + _LENGTH + "number_words": NumberOfWords, + _LENGTH + "nth_paragraph_first_word": ParagraphFirstWordCheck, + _CONTENT + "number_placeholders": PlaceholderChecker, + _CONTENT + "postscript": PostscriptChecker, + _FORMAT + "number_bullet_lists": BulletListChecker, + # _CONTENT + "rephrase_paragraph": RephraseParagraph, + _FORMAT + "constrained_response": ConstrainedResponseChecker, + _FORMAT + "number_highlighted_sections": (HighlightSectionChecker), + _FORMAT + "multiple_sections": SectionChecker, + # _FORMAT + "rephrase": RephraseChecker, + _FORMAT + "json_format": JsonFormat, + _FORMAT + "title": TitleChecker, + # _MULTITURN + "constrained_start": ConstrainedStartChecker, + _COMBINATION + "two_responses": TwoResponsesChecker, + _COMBINATION + "repeat_prompt": RepeatPromptThenAnswer, + _STARTEND + "end_checker": EndChecker, + _CHANGE_CASES + "capital_word_frequency": CapitalWordFrequencyChecker, + _CHANGE_CASES + "english_capital": CapitalLettersEnglishChecker, + _CHANGE_CASES + "english_lowercase": LowercaseLettersEnglishChecker, + _PUNCTUATION + "no_comma": CommaChecker, + _STARTEND + "quotation": QuotationChecker, +} + +INSTRUCTION_LIST = list(INSTRUCTION_DICT.keys()) + [ + _KEYWORD[:-1], + _LANGUAGE[:-1], + _LENGTH[:-1], + _CONTENT[:-1], + _FORMAT[:-1], + _MULTITURN[:-1], + _COMBINATION[:-1], + _STARTEND[:-1], + _CHANGE_CASES[:-1], + _PUNCTUATION[:-1], +] diff --git a/llama_stack/providers/registry/eval.py b/llama_stack/providers/registry/eval.py index 755d30382..f3e42c531 100644 --- a/llama_stack/providers/registry/eval.py +++ b/llama_stack/providers/registry/eval.py @@ -14,7 +14,7 @@ def available_providers() -> List[ProviderSpec]: InlineProviderSpec( api=Api.eval, provider_type="inline::meta-reference", - pip_packages=["tree_sitter"], + pip_packages=["tree_sitter", "pythainlp", "langdetect", "emoji", "nltk"], module="llama_stack.providers.inline.eval.meta_reference", config_class="llama_stack.providers.inline.eval.meta_reference.MetaReferenceEvalConfig", api_dependencies=[ diff --git a/llama_stack/providers/utils/scoring/aggregation_utils.py b/llama_stack/providers/utils/scoring/aggregation_utils.py index 6686e4ade..7254c9433 100644 --- a/llama_stack/providers/utils/scoring/aggregation_utils.py +++ b/llama_stack/providers/utils/scoring/aggregation_utils.py @@ -28,6 +28,17 @@ def aggregate_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any] } +def aggregate_weighted_average(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]: + return { + "weighted_average": sum( + result["score"] * result["weight"] + for result in scoring_results + if result["score"] is not None and result["weight"] is not None + ) + / sum(result["weight"] for result in scoring_results if result["weight"] is not None), + } + + def aggregate_categorical_count( scoring_results: List[ScoringResultRow], ) -> Dict[str, Any]: @@ -46,6 +57,7 @@ def aggregate_median(scoring_results: List[ScoringResultRow]) -> Dict[str, Any]: AGGREGATION_FUNCTIONS = { AggregationFunctionType.accuracy: aggregate_accuracy, AggregationFunctionType.average: aggregate_average, + AggregationFunctionType.weighted_average: aggregate_weighted_average, AggregationFunctionType.categorical_count: aggregate_categorical_count, AggregationFunctionType.median: aggregate_median, } diff --git a/llama_stack/templates/open-benchmark/open_benchmark.py b/llama_stack/templates/open-benchmark/open_benchmark.py index d1c27e901..8d4b81792 100644 --- a/llama_stack/templates/open-benchmark/open_benchmark.py +++ b/llama_stack/templates/open-benchmark/open_benchmark.py @@ -203,6 +203,13 @@ def get_distribution_template() -> DistributionTemplate: uri="huggingface://datasets/llamastack/bfcl_v3?split=train", ), ), + DatasetInput( + dataset_id="ifeval", + purpose=DatasetPurpose.eval_messages_answer, + source=URIDataSource( + uri="huggingface://datasets/llamastack/IfEval?split=train", + ), + ), DatasetInput( dataset_id="docvqa", purpose=DatasetPurpose.eval_messages_answer, @@ -238,6 +245,11 @@ def get_distribution_template() -> DistributionTemplate: dataset_id="bfcl", scoring_functions=["basic::bfcl"], ), + BenchmarkInput( + benchmark_id="meta-reference-ifeval", + dataset_id="ifeval", + scoring_functions=["basic::ifeval"], + ), BenchmarkInput( benchmark_id="meta-reference-docvqa", dataset_id="docvqa", diff --git a/llama_stack/templates/open-benchmark/run.yaml b/llama_stack/templates/open-benchmark/run.yaml index 80a517fe8..a7136c596 100644 --- a/llama_stack/templates/open-benchmark/run.yaml +++ b/llama_stack/templates/open-benchmark/run.yaml @@ -188,6 +188,12 @@ datasets: uri: huggingface://datasets/llamastack/bfcl_v3?split=train metadata: {} dataset_id: bfcl +- purpose: eval/messages-answer + source: + type: uri + uri: huggingface://datasets/llamastack/IfEval?split=train + metadata: {} + dataset_id: ifeval - purpose: eval/messages-answer source: type: uri @@ -221,6 +227,11 @@ benchmarks: - basic::bfcl metadata: {} benchmark_id: meta-reference-bfcl +- dataset_id: ifeval + scoring_functions: + - basic::ifeval + metadata: {} + benchmark_id: meta-reference-ifeval - dataset_id: docvqa scoring_functions: - basic::docvqa