switch logic · EvolvingLMMs-Lab/lmms-eval@043b483

```diff
@@ -6,6 +6,9 @@
 from spacy.cli import download
 from nltk.util import ngrams
 from functools import partial
+import datetime
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
+import json

 # Download the English and Chinese models
 download("en_core_web_sm")
```
```diff
@@ -46,7 +49,7 @@ def fast_filter(answer_text):


 def vcr_doc_to_visual(doc):
-    return [doc["stacked_image"].convert("RGB"), doc["only_it_image"].convert("RGB")]
+    return [doc["stacked_image"].convert("RGB")]


 def vcr_doc_to_text(doc, model_specific_prompt_kwargs=None):
```
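The commit's "switch" starts here: each document now supplies only the stacked image, so the model produces a single prediction per document instead of one per image variant. A minimal sketch of the new contract, using a hypothetical in-memory `doc` rather than a real dataset record:

```python
from PIL import Image


def vcr_doc_to_visual(doc):
    # Same body as the new version in the diff above: only the stacked image is returned.
    return [doc["stacked_image"].convert("RGB")]


# Hypothetical stand-in for a dataset record; real docs also carry fields such as
# "caption" and "crossed_text" that are used further down in this file.
doc = {"stacked_image": Image.new("RGB", (448, 224))}

visuals = vcr_doc_to_visual(doc)
assert len(visuals) == 1  # previously 2: stacked_image and only_it_image
```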
```diff
@@ -80,7 +83,7 @@ def vcr_process_results_single(crossed_text, result, language):
         doc: a instance of the eval dataset
         results: [pred]
     Returns:
-        a dictionary with key: metric name (in this case mme score), value: metric value
+        a dictionary with key: metric name (in this case vcr score), value: metric value
     """

     assert language in ["en", "zh"], f"Language {language} is not supported."
```
```diff
@@ -171,29 +174,28 @@ def vcr_en_process_results(doc, results):
         doc: a instance of the eval dataset
         results: [pred]
     Returns:
-        a dictionary with key: metric name (in this case mme score), value: metric value
+        a dictionary with key: metric name (in this case vcr score), value: metric value
     """
-    assert len(results) == 2, f"Expected 2 results, got {len(results)}"
-    output = {}
-    for i in range(len(doc["crossed_text"])):
-        res_stacked_image_results = vcr_process_results_single(
-            doc["crossed_text"][i], results[0], "en"
-        )
-        res_only_image_results = vcr_process_results_single(
-            doc["crossed_text"][i], results[1], "en"
-        )
-        output.update(
-            {
-                f"res_stacked_image__{k}___{i}": v
-                for k, v in res_stacked_image_results.items()
-            }
-        )
-        output.update(
-            {
-                f"res_only_it_image__{k}___{i}": v
-                for k, v in res_only_image_results.items()
-            }
-        )
+    output = {
+        "max_sim_val": [],
+        "precision": [],
+        "recall": [],
+        "f1": [],
+        "jaccard": [],
+        "rouge1": [],
+        "exact_match": [],
+    }
+    crossed_text = doc["crossed_text"]
+    for i in range(len(crossed_text)):
+        tmp = vcr_process_results_single(crossed_text[i], results, "en")
+        for k in output.keys():
+            output[k].append(
+                {
+                    "score": tmp[k],
+                    "max_sim_string": tmp["max_sim_string"],
+                    "caption": doc["caption"],
+                }
+            )
     return output


```
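After this change, `vcr_en_process_results` returns per-metric lists rather than flat `res_stacked_image__*` / `res_only_it_image__*` keys: one entry per crossed-out blank, each carrying the score, the best-matching string, and the caption. An illustrative shape with made-up values:

```python
# Hypothetical values, shown only to illustrate the new output shape.
example_output = {
    "exact_match": [
        {"score": 1.0, "max_sim_string": "colosseum", "caption": "..."},
        {"score": 0.0, "max_sim_string": "ancient rome", "caption": "..."},
    ],
    # "max_sim_val", "precision", "recall", "f1", "jaccard" and "rouge1"
    # hold lists of the same shape, one entry per crossed-out blank.
}
```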
```diff
@@ -203,62 +205,51 @@ def vcr_zh_process_results(doc, results):
         doc: a instance of the eval dataset
         results: [pred]
     Returns:
-        a dictionary with key: metric name (in this case mme score), value: metric value
+        a dictionary with key: metric name (in this case vcr score), value: metric value
     """
-    assert len(results) == 2, f"Expected 2 results, got {len(results)}"
-    output = {}
-    for i in range(len(doc["crossed_text"])):
-        res_stacked_image_results = vcr_process_results_single(
-            doc["crossed_text"][i], results[0], "zh"
-        )
-        res_only_image_results = vcr_process_results_single(
-            doc["crossed_text"][i], results[1], "zh"
-        )
-        output.update(
-            {
-                f"res_stacked_image__{k}___{i}": v
-                for k, v in res_stacked_image_results.items()
-            }
-        )
-        output.update(
-            {
-                f"res_only_it_image__{k}___{i}": v
-                for k, v in res_only_image_results.items()
-            }
-        )
+    output = {
+        "max_sim_val": [],
+        "precision": [],
+        "recall": [],
+        "f1": [],
+        "jaccard": [],
+        "rouge1": [],
+        "exact_match": [],
+    }
+    crossed_text = doc["crossed_text"]
+    for i in range(len(crossed_text)):
+        tmp = vcr_process_results_single(crossed_text[i], results, "zh")
+        for k in output.keys():
+            output[k].append(
+                {
+                    "score": tmp[k],
+                    "max_sim_string": tmp["max_sim_string"],
+                    "caption": doc["caption"],
+                }
+            )
     return output


-def vcr_aggregate_results(results):
+def vcr_aggregate_results(results, args):
     """
     Args:
         results: a list of values returned by process_results
     Returns:
         A dictionary of dictionary of float, where the outer dictionary has keys "res_stacked_image" and "res_only_it_image"
     """
-    output = {
-        "res_stacked_image__precision": 0,
-        "res_stacked_image__recall": 0,
-        "res_stacked_image__f1": 0,
-        "res_stacked_image__jaccard": 0,
-        "res_stacked_image__rouge1": 0,
-        "res_stacked_image__exact_match": 0,
-        "res_only_it_image__precision": 0,
-        "res_only_it_image__recall": 0,
-        "res_only_it_image__f1": 0,
-        "res_only_it_image__jaccard": 0,
-        "res_only_it_image__rouge1": 0,
-        "res_only_it_image__exact_match": 0,
-    }
-
-    for output_key in output.keys():
-        count = 0
-        query_domain, query_metric_name = output_key.split("__")
-        for inner_dict in results:
-            for inner_key, inner_value in inner_dict.items():
-                key_domain, key_metric_name, _ = inner_key.split("__")
-                if query_domain == key_domain and query_metric_name == key_metric_name:
-                    output[output_key] += inner_value
-                    count += 1
-        output[output_key] /= count
-    return output
+    scores = 0
+    count = 0
+    output_dict = {}
+    for i in range(len(results)):
+        for blank_id in range(len(results[i])):
+            scores += results[i][blank_id]["score"]
+            count += 1
+        output_dict[str(i)] = results[i]
+
+    now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    path = generate_submission_file(f"vcr_submission_{now_date_time}.json", args)
+    with open(path, "w") as f:
+        json.dump(output_dict, f)
+    print(f"Submission file saved to {path}")
+    eval_logger.info(f"Submission file saved to {path}")
+    return scores / count
```
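The aggregator no longer averages per-domain metrics into a dictionary: it flattens the per-blank entries, averages their `score` field into a single float, and dumps all entries to a timestamped submission JSON via `generate_submission_file` before returning. (The unchanged docstring above still describes the old dictionary-of-dictionaries return value.) A minimal sketch of the averaging step with toy inputs, leaving out the file dump:

```python
# Toy, hypothetical per-document entries mirroring the shape produced by
# vcr_en_process_results / vcr_zh_process_results for a single metric.
results = [
    [{"score": 1.0, "max_sim_string": "fox", "caption": "..."}],
    [
        {"score": 0.5, "max_sim_string": "dog", "caption": "..."},
        {"score": 0.0, "max_sim_string": "cat", "caption": "..."},
    ],
]

scores = sum(entry["score"] for doc_entries in results for entry in doc_entries)
count = sum(len(doc_entries) for doc_entries in results)
print(scores / count)  # 0.5, the single float reported as the task metric
```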