add II-Bench · EvolvingLMMs-Lab/lmms-eval@6248113
```python
import json
import logging
import re
from collections import Counter

from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

PROMPT = """Question: {}
(A) {}
(B) {}
(C) {}
(D) {}
(E) {}
(F) {}"""


def ii_bench_doc_to_text(doc, model_specific_prompt_kwargs):
    # Fill the question and its six answer options into the multiple-choice template.
    question = PROMPT.format(doc["question"], doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"])
    pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
    post_prompt = model_specific_prompt_kwargs["post_prompt"]
    return f"{pre_prompt}{question}{post_prompt}"


def ii_bench_doc_to_visual(doc):
    return [doc["image"].convert("RGB")]


def extract_option_labels(text, options=None):
    if isinstance(text, dict):
        return "error"

    # First look for labels written as "(A)" ... "(F)".
    pattern = r"\(([A-F])\)"
    matches = re.findall(pattern, text)

    # Fall back to bare option letters, e.g. "The answer is B".
    if not matches:
        pattern = r"\b([A-F])\b"
        matches = re.findall(pattern, text)

    if matches:
        # Pick the most frequent label; on ties, keep the one listed last.
        counter = Counter(matches)
        most_common = counter.most_common()
        max_count = most_common[0][1]
        candidates = [item for item in most_common if item[1] == max_count]
        return candidates[-1][0]
    else:
        # No letter found: match the response text against the option strings themselves.
        if options:
            counter = Counter()
            for i, option in enumerate(options, start=1):
                label = chr(64 + i)  # 1 -> "A", 2 -> "B", ...
                option_stripped = option.strip()
                if option_stripped in text:
                    counter[label] += 1
                elif text in option:
                    counter[label] += 1
            if counter:
                most_common = counter.most_common()
                max_count = most_common[0][1]
                candidates = [item for item in most_common if item[1] == max_count]
                return candidates[-1][0]
    return None


def ii_bench_process_results(doc, results):
    response = results[0]
    predict = extract_option_labels(response, [doc["option1"], doc["option2"], doc["option3"], doc["option4"], doc["option5"], doc["option6"]])
    return {"submission": {"id": doc["id"], "predict_answer": predict, "response": response}}


def ii_bench_aggregate_submissions(results, args):
    file = generate_submission_file("ii_bench_test_for_submission.json", args)
    with open(file, "w") as f:
        json.dump(results, f, indent=4)
    logging.getLogger("lmms-eval").info(f"Results saved to {file}")
```
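For reference, a minimal usage sketch (not part of the commit) showing how the helpers above behave. The `doc` fields mirror the keys used in this file, but the question, option texts, prompt wrappers, and model responses are hypothetical, chosen only to exercise the three extraction paths.

```python
# Assumes the functions above are defined or importable in the current scope.

# Hypothetical document and prompt kwargs, for illustration only.
doc = {
    "question": "What does the image imply?",
    "option1": "Hope", "option2": "Despair", "option3": "Freedom",
    "option4": "Confusion", "option5": "Joy", "option6": "Anger",
}
prompt_kwargs = {"pre_prompt": "", "post_prompt": "\nAnswer with the option's letter."}

print(ii_bench_doc_to_text(doc, prompt_kwargs))  # full prompt with (A)-(F) options

options = [doc[f"option{i}"] for i in range(1, 7)]

# Parenthesised label is matched first.
print(extract_option_labels("The answer is (C).", options))             # -> "C"
# Bare letter fallback.
print(extract_option_labels("I would choose B here.", options))         # -> "B"
# No letter at all: the response is matched against the option strings.
print(extract_option_labels("It conveys a sense of Despair.", options)) # -> "B"
# Dict responses (e.g. API error payloads) are flagged as "error".
print(extract_option_labels({"message": "refused"}, options))           # -> "error"
```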