Skip to content

Commit f888858

Browse files
NathanHB and jgyasu
authored
bbeh (#1124)
* [EVAL] BIG-Bench Extra Hard * update * fixing the prompt * Apply suggestion from @NathanHB * Apply suggestion from @NathanHB --------- Co-authored-by: Jigyasu <jigyasu@outlook.in>
1 parent 61c547b commit f888858

File tree

1 file changed

+241
-0
lines changed

1 file changed

+241
-0
lines changed
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
"""
2+
name:
3+
BIG-Bench Extra Hard
4+
5+
dataset:
6+
jgyasu/bbeh
7+
8+
abstract:
9+
BIG-Bench Extra Hard (BBEH) is a successor to BIG-Bench Hard (BBH), created to evaluate large
10+
language models on substantially more difficult general-reasoning tasks. Each BBH task is replaced
11+
with a new task targeting the same underlying reasoning skill but at a significantly higher difficulty.
12+
13+
languages:
14+
english
15+
16+
tags:
17+
reasoning
18+
19+
paper:
20+
https://arxiv.org/abs/2502.19187
21+
22+
starred:
23+
true
24+
"""
25+
26+
from inspect_ai.dataset import Sample
27+
from inspect_ai.scorer import answer
28+
from inspect_ai.solver import generate, system_message
29+
30+
from lighteval.metrics.metrics import Metrics
31+
from lighteval.tasks.lighteval_task import LightevalTaskConfig
32+
from lighteval.tasks.requests import Doc
33+
34+
35+
def bbeh_prompt(line, task_name: str = None):
    """Convert one raw BBEH dataset row into a lighteval ``Doc``.

    Args:
        line: Dataset record with at least an ``input`` key (the question
            text) and a ``target`` key (the gold answer).
        task_name: Name of the lighteval task the doc belongs to.

    Returns:
        A ``Doc`` whose query wraps the question in a "Question:/Answer:"
        template and whose single gold choice (``gold_index=0``) is the
        record's target.
    """
    # Drop keys whose value is None so optional dataset fields never leak in.
    line = {k: v for k, v in line.items() if v is not None}

    query = "Question: \n"
    query += line["input"]
    query += "\nAnswer:"

    # Doc.choices expects a list of candidate strings.  The raw target is a
    # plain string; passing it directly would make every *character* a
    # separate choice, so wrap it (lists are passed through unchanged).
    target = line["target"]
    choices = target if isinstance(target, list) else [target]

    return Doc(
        task_name=task_name,
        query=query,
        choices=choices,
        gold_index=0,
        instruction="",
    )
49+
50+
51+
def record_to_sample(record):
    """Map one BBEH dataset record onto an inspect_ai ``Sample``."""
    return Sample(input=str(record["input"]), target=record["target"])
56+
57+
58+
SYSTEM_MESSAGE = """Submit your answer in the following format:
59+
ANSWER: {your answer}
60+
"""
61+
62+
63+
# Keyword arguments shared by every BBEH subset's LightevalTaskConfig below;
# each task only overrides `name` and `hf_subset`.
COMMON_TASK_ARGS = {
    "prompt_function": bbeh_prompt,
    "hf_repo": "jgyasu/bbeh",
    # The dataset exposes a single "train" split, used here for evaluation;
    # no few-shot split is available.
    "hf_avail_splits": ["train"],
    "evaluation_splits": ["train"],
    "few_shots_split": None,
    "few_shots_select": None,
    # -1: no explicit cap on generation length.
    "generation_size": -1,
    # NOTE(review): loglikelihood_acc with the single-choice docs produced by
    # bbeh_prompt looks degenerate for a generative benchmark — confirm this
    # is the intended lighteval-side metric vs. the inspect_ai scorer below.
    "metrics": [Metrics.loglikelihood_acc],
    "stop_sequence": ["</s>", "Q=", "\n\n"],
    "version": 0,
    # inspect_ai integration: how records become Samples and how the model is
    # prompted and scored when the task runs through the inspect_ai backend.
    "sample_fields": record_to_sample,
    "solver": [system_message(SYSTEM_MESSAGE), generate(cache=True)],
    # Extracts the answer matching the "ANSWER: ..." format requested by
    # SYSTEM_MESSAGE; "line" presumably matches a whole line — confirm against
    # the inspect_ai `answer` scorer docs.
    "scorer": answer(pattern="line"),
}
78+
79+
def _make_task(subset: str) -> LightevalTaskConfig:
    """Build the LightevalTaskConfig for one BBEH subset.

    Every subset shares COMMON_TASK_ARGS; only the lighteval task name and
    the Hugging Face dataset subset differ.
    """
    return LightevalTaskConfig(
        name=f"bigbench_extra_hard:{subset}",
        hf_subset=subset,
        **COMMON_TASK_ARGS,
    )


# One task per BBEH subset, kept as module-level names so they can still be
# referenced individually.
boardgame_qa = _make_task("boardgame_qa")
boolean_expressions = _make_task("boolean_expressions")
buggy_tables = _make_task("buggy_tables")
causal_understanding = _make_task("causal_understanding")
disambiguation_qa = _make_task("disambiguation_qa")
dyck_languages = _make_task("dyck_languages")
geometric_shapes = _make_task("geometric_shapes")
hyperbaton = _make_task("hyperbaton")
linguini = _make_task("linguini")
movie_recommendation = _make_task("movie_recommendation")
multistep_arithmetic = _make_task("multistep_arithmetic")
nycc = _make_task("nycc")
object_counting = _make_task("object_counting")
object_properties = _make_task("object_properties")
sarc_triples = _make_task("sarc_triples")
shuffled_objects = _make_task("shuffled_objects")
spatial_reasoning = _make_task("spatial_reasoning")
sportqa = _make_task("sportqa")
temporal_sequence = _make_task("temporal_sequence")
time_arithmetic = _make_task("time_arithmetic")
web_of_lies = _make_task("web_of_lies")
word_sorting = _make_task("word_sorting")
zebra_puzzles = _make_task("zebra_puzzles")
216+
217+
# Registry consumed by lighteval's task discovery: every BBEH subset task
# defined in this module.
TASKS_TABLE = [
    boardgame_qa,
    boolean_expressions,
    buggy_tables,
    causal_understanding,
    disambiguation_qa,
    dyck_languages,
    geometric_shapes,
    hyperbaton,
    linguini,
    movie_recommendation,
    multistep_arithmetic,
    nycc,
    object_counting,
    object_properties,
    sarc_triples,
    shuffled_objects,
    spatial_reasoning,
    sportqa,
    temporal_sequence,
    time_arithmetic,
    web_of_lies,
    word_sorting,
    zebra_puzzles,
]

0 commit comments

Comments
 (0)