Add Muirbench (#143) · EvolvingLMMs-Lab/lmms-eval@5fc5f2f (original) (raw)
``
1
+
``
2
`+
from lmms_eval.filters.extraction import ExtendedRegexFilter
`
``
3
`+
from lmms_eval.filters.transformation import MapFilter
`
``
4
`+
import re
`
``
5
`+
import pandas as pd
`
``
6
+
``
7
+
``
8
`+
def muir_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Build the multiple-choice prompt text for one document.

    The question is followed by one line per option, each labelled with
    consecutive letters starting at "A", and the whole text is wrapped in
    the optional pre/post prompt strings.

    Args:
        doc: Mapping with a "question" entry and an "options" sequence.
        model_specific_prompt_kwargs: Optional mapping that may carry
            "pre_prompt" and "post_prompt" strings. When it is None, or a
            key is absent, an empty string is used for that part.

    Returns:
        The assembled prompt string.

    Raises:
        KeyError: If ``doc`` lacks "question" or "options".
    """
    question, choices = doc["question"], doc["options"]

    # The parameter defaults to None, so it must not be subscripted blindly.
    prompt_kwargs = model_specific_prompt_kwargs or {}
    pre_prompt = prompt_kwargs.get("pre_prompt", "")
    post_prompt = prompt_kwargs.get("post_prompt", "")

    choices_str = "\n".join(
        f"{chr(ord('A') + index)}. {choice}" for index, choice in enumerate(choices)
    )
    return f"{pre_prompt}{question}\n{choices_str}{post_prompt}"
`
``
16
+
``
17
+
``
18
`+
def muir_doc_to_visual(doc):
    """Return the document's images, each converted to RGB.

    Args:
        doc: Mapping whose "image_list" entry is an iterable of objects
            exposing ``convert(mode)`` (presumably PIL images; confirm
            against the dataset loader).

    Returns:
        A new list holding ``image.convert("RGB")`` for every image, in
        the original order.
    """
    rgb_images = []
    for picture in doc["image_list"]:
        rgb_images.append(picture.convert("RGB"))
    return rgb_images
`
``
21
+
``
22
+
``
23
`+
def muir_doc_to_target(doc):
    """Return the reference answer stored under the document's "answer" key."""
    target = doc["answer"]
    return target
`
``
25
+
``
26
+
``
27
`+
def muir_process_results(doc, result):
    """Package one prediction with the document metadata used for scoring.

    Args:
        doc: Mapping providing "task", "idx", "image_relation", "answer"
            and "image_type".
        result: Sequence whose first element is the prediction.

    Returns:
        A dict with the single key "muirbench_score_overall" mapping to a
        record that holds the prediction and the copied document fields.
    """
    record = {"pred": result[0]}
    # Copy the metadata fields in the same order they are reported.
    for field in ("task", "idx", "image_relation", "answer", "image_type"):
        record[field] = doc[field]
    return {"muirbench_score_overall": record}
`
``
45
+
``
46
+
``
47
`+
def muir_aggregation(results):
    """Compute overall accuracy and print the per-task accuracy table.

    A prediction counts as correct when it equals the answer after
    lower-casing and stripping surrounding whitespace on both sides.

    Args:
        results: Iterable of mappings with "task", "pred" and "answer"
            entries ("pred" and "answer" must be strings).

    Returns:
        The fraction of correct predictions as a float; 0.0 when
        ``results`` is empty (instead of dividing by zero).
    """
    task_num = {}
    task_correct = {}
    for result in results:
        task = result["task"]
        is_correct = result["pred"].lower().strip() == result["answer"].lower().strip()
        task_num[task] = task_num.get(task, 0) + 1
        task_correct[task] = task_correct.get(task, 0) + int(is_correct)

    total = sum(task_num.values())
    # Guard against an empty result set, which would otherwise divide by zero.
    score = sum(task_correct.values()) / total if total else 0.0

    # Every task in task_correct has at least one sample, so this is safe.
    task_score = {task: hits / task_num[task] for task, hits in task_correct.items()}

    print("=" * 50)
    for k, v in task_score.items():
        print(f"{k} : {v:.2f}")
    print("=" * 50)

    return score
`
``
73
+
``
74
+
``
75
+
``
76
+
``
77
`+
class MultiChoiceRegexFilter(ExtendedRegexFilter):
    """Filter that reduces each response set to a single option letter.

    When a response starts with an upper-case letter matched by the
    pattern in ``apply``, that letter is kept; otherwise the response
    text is kept unchanged.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to ``ExtendedRegexFilter``.

        The accepted options (regex_pattern, group_select, ignore_case,
        ignore_punctuation, regexes_to_ignore) are presumably interpreted
        by the parent class; verify against ExtendedRegexFilter. Note that
        ``apply`` below does not read any of them: it uses its own fixed
        pattern.
        """
        super().__init__(*args, **kwargs)

    def apply(self, resps, docs):
        """Extract one filtered answer per response set.

        Args:
            resps: A list in which each element is a list of model
                responses for one particular input/target pair.
            docs: The documents paired with ``resps``; only used to bound
                the iteration (pairs are formed with ``zip``).

        Returns:
            A list with one entry per (responses, doc) pair: the filtered
            form of the first response in that set.

        Raises:
            IndexError: If a response set is empty.
        """
        # Regex to directly extract the option letter from the model response.
        # Compiled once because it does not depend on the loop variables.
        # NOTE(review): the trailing "." is unescaped, so it matches any
        # character rather than only a literal period; confirm this is intended.
        option_letter_regex = re.compile(r"^\s*([A-Z]).")

        filtered_resps = []

        # Each response set is processed independently.
        for r, _doc in zip(resps, docs):
            filtered = []
            for resp in r:
                # Try to match the option letter at the start of the response.
                match = option_letter_regex.match(resp)
                # Keep the matched letter, or fall back to the original response.
                filtered.append(match.group(1) if match else resp)

            # Only the first response of each set is reported.
            filtered_resps.append(filtered[0])

        return filtered_resps
`