modify the form of VCR · EvolvingLMMs-Lab/lmms-eval@e1f04db
@@ -1,9 +1,5 @@
-from collections import defaultdict
 import os
 from difflib import SequenceMatcher as SM
-import datetime
-import json
-from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 import evaluate
 import logging
 import spacy
@@ -34,6 +30,21 @@
 }
 
 
+def fast_filter(answer_text):
+    if "I can't" in answer_text:
+        return True
+    elif "I cannot" in answer_text:
+        return True
+    elif "sorry" in answer_text.lower():
+        return True
+    if "无法" in answer_text:
+        return True
+    elif "抱歉" in answer_text:
+        return True
+    else:
+        return False
+
+
 def vcr_doc_to_visual(doc):
     return [doc["stacked_image"].convert("RGB"), doc["only_it_image"].convert("RGB")]
 
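The new `fast_filter` short-circuits scoring when the model refuses to answer, matching common English and Chinese refusal phrases ("无法" ≈ "cannot", "抱歉" ≈ "sorry"). A minimal sketch of its behavior, assuming the function from the hunk above is in scope (the import path in the comment is illustrative, not confirmed by this commit):

```python
# Hypothetical import path; adjust to wherever this utils file actually lives.
# from lmms_eval.tasks.vcr_wiki.utils import fast_filter

assert fast_filter("I can't make out the covered text.")       # English refusal
assert fast_filter("Sorry, the text is unreadable.")           # "sorry" is matched case-insensitively
assert fast_filter("抱歉，我无法识别被遮挡的文字。")                # Chinese refusal
assert not fast_filter("The covered text reads: hello world")  # real answers pass through
```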
@@ -63,16 +74,29 @@ def tokenize(text, language):
     return [token.text for token in processed_text]
 
 
-def vcr_process_results_single(doc, result, language):
+def vcr_process_results_single(crossed_text, result, language):
     """
     Args:
         doc: a instance of the eval dataset
        results: [pred]
     Returns:
         a dictionary with key: metric name (in this case mme score), value: metric value
     """
+
     assert language in ["en", "zh"], f"Language {language} is not supported."
-    crossed_text = doc["crossed_text"]
+
+    if fast_filter(result):
+        return {
+            "crossed_text": crossed_text,
+            "max_sim_val": 0,
+            "max_sim_string": "",
+            "precision": 0,
+            "recall": 0,
+            "f1": 0,
+            "jaccard": 0,
+            "rouge1": 0,
+            "exact_match": 0,
+        }
     tokens_result = tokenize(result, language)
     tokens_crossed_text = tokenize(crossed_text, language)
 
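`vcr_process_results_single` now takes the `crossed_text` string for a single blank instead of the whole `doc`, and a refusal detected by `fast_filter` returns an all-zero metric dict before any tokenization happens. A minimal sketch of the short-circuit, assuming the functions from the hunks above are in scope (the argument strings are illustrative):

```python
# A refusal never reaches tokenize(), so no spacy model is needed on this path.
metrics = vcr_process_results_single("quick brown fox", "I cannot see the covered text.", "en")
assert metrics["max_sim_string"] == "" and metrics["f1"] == 0
```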
@@ -150,10 +174,26 @@ def vcr_en_process_results(doc, results):
         a dictionary with key: metric name (in this case mme score), value: metric value
     """
     assert len(results) == 2, f"Expected 2 results, got {len(results)}"
-    output = {
-        "res_stacked_image": vcr_process_results_single(doc, results[0], "en"),
-        "res_only_it_image": vcr_process_results_single(doc, results[1], "en"),
-    }
+    output = {}
+    for i in range(len(doc["crossed_text"])):
+        res_stacked_image_results = vcr_process_results_single(
+            doc["crossed_text"][i], results[0], "en"
+        )
+        res_only_image_results = vcr_process_results_single(
+            doc["crossed_text"][i], results[1], "en"
+        )
+        output.update(
+            {
+                f"res_stacked_image__{k}___{i}": v
+                for k, v in res_stacked_image_results.items()
+            }
+        )
+        output.update(
+            {
+                f"res_only_it_image__{k}___{i}": v
+                for k, v in res_only_image_results.items()
+            }
+        )
     return output
 
 
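Per-blank metrics are now flattened into one dict under keys of the form `res_stacked_image__{metric}___{i}` / `res_only_it_image__{metric}___{i}`: a double underscore before the metric name and a triple underscore before the blank index. A self-contained sketch of the scheme (`fake_metrics` stands in for a real `vcr_process_results_single` result):

```python
fake_metrics = {"precision": 1.0, "recall": 0.5}  # stand-in for one blank's scores
output = {}
for i in range(2):  # pretend the doc has two crossed-out blanks
    output.update({f"res_stacked_image__{k}___{i}": v for k, v in fake_metrics.items()})
print(sorted(output))
# ['res_stacked_image__precision___0', 'res_stacked_image__precision___1',
#  'res_stacked_image__recall___0', 'res_stacked_image__recall___1']
```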
@@ -166,10 +206,26 @@ def vcr_zh_process_results(doc, results):
         a dictionary with key: metric name (in this case mme score), value: metric value
     """
     assert len(results) == 2, f"Expected 2 results, got {len(results)}"
-    output = {
-        "res_stacked_image": vcr_process_results_single(doc, results[0], "zh"),
-        "res_only_it_image": vcr_process_results_single(doc, results[1], "zh"),
-    }
+    output = {}
+    for i in range(len(doc["crossed_text"])):
+        res_stacked_image_results = vcr_process_results_single(
+            doc["crossed_text"][i], results[0], "zh"
+        )
+        res_only_image_results = vcr_process_results_single(
+            doc["crossed_text"][i], results[1], "zh"
+        )
+        output.update(
+            {
+                f"res_stacked_image__{k}___{i}": v
+                for k, v in res_stacked_image_results.items()
+            }
+        )
+        output.update(
+            {
+                f"res_only_it_image__{k}___{i}": v
+                for k, v in res_only_image_results.items()
+            }
+        )
     return output
 
 
@@ -180,36 +236,29 @@ def vcr_aggregate_results(results):
     Returns:
         A dictionary of dictionary of float, where the outer dictionary has keys "res_stacked_image" and "res_only_it_image"
     """
-
     output = {
-        "res_stacked_image": {
-            "max_sim_val": 0,
-            "precision": 0,
-            "recall": 0,
-            "f1": 0,
-            "jaccard": 0,
-            "rouge1": 0,
-        },
-        "res_only_it_image": {
-            "max_sim_val": 0,
-            "precision": 0,
-            "recall": 0,
-            "f1": 0,
-            "jaccard": 0,
-            "rouge1": 0,
-        },
+        "res_stacked_image__precision": 0,
+        "res_stacked_image__recall": 0,
+        "res_stacked_image__f1": 0,
+        "res_stacked_image__jaccard": 0,
+        "res_stacked_image__rouge1": 0,
+        "res_stacked_image__exact_match": 0,
+        "res_only_it_image__precision": 0,
+        "res_only_it_image__recall": 0,
+        "res_only_it_image__f1": 0,
+        "res_only_it_image__jaccard": 0,
+        "res_only_it_image__rouge1": 0,
+        "res_only_it_image__exact_match": 0,
     }
-    for target_domain in output.keys():
-        for target_metric_name in output[target_domain].keys():
-            score = 0
-            count = 0
-            for inner_dict in results:
-                for inner_key, inner_value in inner_dict.items():
-                    if inner_key == target_domain:
-                        for blank_id, blank_metrics in inner_value.items():
-                            for metric_name, metric_value in blank_metrics.items():
-                                if metric_name == target_metric_name:
-                                    score += metric_value
-                                    count += 1
-            output[target_domain][target_metric_name] = score / count
+
+    for output_key in output.keys():
+        count = 0
+        query_domain, query_metric_name = output_key.split("__")
+        for inner_dict in results:
+            for inner_key, inner_value in inner_dict.items():
+                key_domain, key_metric_name, _ = inner_key.split("__")
+                if query_domain == key_domain and query_metric_name == key_metric_name:
+                    output[output_key] += inner_value
+                    count += 1
+        output[output_key] /= count
     return output
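The aggregator depends on that naming scheme: `inner_key.split("__")` yields exactly three parts, because the triple underscore before the blank index parses as `"__"` plus a leading `"_"` on the index, which is then discarded. Each `domain__metric` average therefore pools every blank of every doc. A self-contained check:

```python
key = "res_stacked_image__f1___3"
domain, metric, index = key.split("__")
assert (domain, metric) == ("res_stacked_image", "f1")
assert index == "_3"  # "___" splits as "__" + "_", leaving a leading underscore on the index
```

String-valued entries such as `...__crossed_text___{i}` and `...__max_sim_string___{i}` also flow through `results`, but they never match one of the twelve numeric `domain__metric` query keys, so the accumulation skips them.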