[WIP] adding mmbench dev evaluation (#75) · EvolvingLMMs-Lab/lmms-eval@a19278c
@@ -9,7 +9,7 @@
 from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
 from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 
-with open(Path(__file__).parent / "mmbench_en.yaml", "r") as f:
+with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
     raw_data = f.readlines()
     safe_data = []
     for i, line in enumerate(raw_data):
@@ -19,7 +19,18 @@
 
     config = yaml.safe_load("".join(safe_data))
 
-mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"])
+GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
+API_TYPE = os.getenv("API_TYPE", "openai")
+
+if API_TYPE == "openai":
+    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+elif API_TYPE == "azure":
+    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+
+
+mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)
 
 
 def mmbench_doc_to_visual(doc):
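For reference, a minimal sketch of how the GPT judge added in this hunk is expected to be configured; the environment variable names mirror the `os.getenv` calls above, while the concrete values are illustrative assumptions, not part of the commit:

```python
import os

# Illustrative values only; the variable names come from the hunk above.
# They must be exported before lmms-eval imports the task module, since the
# values are read at module load time.
os.environ["API_TYPE"] = "openai"  # or "azure" to use the AZURE_ENDPOINT / AZURE_API_KEY branch
os.environ["OPENAI_API_URL"] = "https://api.openai.com/v1/chat/completions"
os.environ["OPENAI_API_KEY"] = "sk-..."  # replace with a real key; otherwise it defaults to "YOUR_API_KEY"

# With these set, importing the task builds the module-level MMBench_Evaluator
# with API_KEY, API_URL, and GPT_EVAL_MODEL_NAME taken from mmbench.yaml.
```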
@@ -54,7 +65,18 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
 
 def mmbench_process_results(doc, results):
     model_response = results[0].strip()
-    return {
+    data = {
+        "gpt_eval_score": {
+            "index": doc["index"],
+            "question": doc["question"],
+            "answer": doc["answer"],
+            "prediction": model_response,
+            "hint": doc["hint"],
+            "source": doc["source"],
+            "split": doc["split"],
+            "category": doc["category"],
+            "L2-category": doc["L2-category"],
+        },
         "submission": {
             "index": doc["index"],
             "question": doc["question"],
@@ -65,11 +87,30 @@ def mmbench_process_results(doc, results):
             "split": doc["split"],
             "category": doc["category"],
             "L2-category": doc["L2-category"],
-        }
+        },
+    }
+    option_candidate = ["A", "B", "C", "D", "E"]
+    for c in option_candidate:
+        data["submission"][c] = doc.get(c, "nan")
+        data["gpt_eval_score"][c] = doc.get(c, "nan")
+    return data
+
+
+def mmbench_aggregate_dev_results_eval(results, args):
+    print(f"============= MMBench-EN(Dev) Detailed Results =============")
+    overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
+    file = generate_submission_file("mmbench_en_dev_results.json", args)
+    details_info = {
+        "overall_acc": overall_acc,
+        "category_acc": category_acc,
+        "l2_category_acc": l2_category_acc,
     }
+    with open(file, "w") as f:
+        json.dump(details_info, f)
+    return overall_acc * 100
 
 
-def mmbench_aggregate_dev_results(results, args):
+def mmbench_aggregate_dev_results_submission(results, args):
     df = pd.DataFrame(results)
     excel_write_path = generate_submission_file("mmbench_en_dev_results.xlsx", args)
     with pd.ExcelWriter(excel_write_path) as writer:
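For context, a small self-contained sketch of the per-document record shape these hunks assume and of the option backfill they add; the field names and the `doc.get(c, "nan")` behaviour come from the diff above, while the concrete example values are made up:

```python
# Example doc with the fields read by mmbench_process_results; values are illustrative.
doc = {
    "index": 0,
    "question": "What is shown in the image?",
    "answer": "A",
    "hint": "nan",
    "source": "dev",
    "split": "dev",
    "category": "object_localization",
    "L2-category": "coarse_perception",
    "A": "a cat",
    "B": "a dog",
    "C": "a bird",
    "D": "a fish",
    # "E" is deliberately absent.
}

# Mirrors the loop added above: options the question does not define are
# backfilled with the string "nan", so every record carries columns A..E.
options = {c: doc.get(c, "nan") for c in ["A", "B", "C", "D", "E"]}
print(options["E"])  # -> "nan"

# In the task itself, the "gpt_eval_score" entries are aggregated by
# mmbench_aggregate_dev_results_eval (GPT-judged accuracy, dumped to
# mmbench_en_dev_results.json), while the "submission" entries are written to
# mmbench_en_dev_results.xlsx by mmbench_aggregate_dev_results_submission.
```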
`