Merge pull request #129 from Dannoopsy/mmbench_ru · EvolvingLMMs-Lab/lmms-eval@39d40de
import yaml
import os
from pathlib import Path
import pandas as pd
import json

from loguru import logger as eval_logger

from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

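# Load the task's mmbench.yaml (next to this file) at import time to read the
# metadata block: the judge model name and the system prompt used below.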
with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data.append(line)

    config = yaml.safe_load("".join(safe_data))

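# The GPT judge endpoint and key come from environment variables: API_TYPE selects
# between the OpenAI and Azure settings, with OpenAI as the default.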
GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
else:
    API_URL = "YOUR_API_URL"
    API_KEY = "YOUR_API_KEY"


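# A single module-level evaluator is shared by the prompt-building and scoring helpers below.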
mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)


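# The functions below are the hooks that the MMBench-RU task YAMLs reference via
# !function tags (the same tags that are stripped out when the config is parsed above).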
def mmbench_doc_to_visual(doc):
    return [doc["image"].convert("RGB")]


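# Build the text prompt for one document: the lettered options (A-E) are rendered by
# the evaluator, and the hint is prepended to the question only when it is present.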
def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
    option_candidate = ["A", "B", "C", "D", "E"]
    options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate)

    data = {
        "img": doc["image"],
        "question": doc["question"],
        "answer": doc.get("answer", None),
        "options": options_prompt,
        "category": doc["category"],
        "L2-category": doc["l2-category"],
        "options_dict": options_dict,
        "index": doc["index"],
        "hint": doc["hint"],
        "source": doc["source"],
        "split": doc["split"],
    }

    query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) and data["hint"] != "nan" else f"{data['question']} {data['options']}"

    if model_specific_prompt_kwargs:
        query_prompt = f"{query_prompt}\n{model_specific_prompt_kwargs['post_prompt']}"

    return query_prompt


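# Package one model response twice: once for the GPT-based scorer ("gpt_eval_score")
# and once for the Excel submission file ("submission"), both carrying the doc
# metadata; missing option columns default to "nan".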
def mmbench_process_results(doc, results):
    model_response = results[0].strip()
    data = {
        "gpt_eval_score": {
            "index": doc["index"],
            "question": doc["question"],
            "answer": doc["answer"],
            "prediction": model_response,
            "hint": doc["hint"],
            "source": doc["source"],
            "split": doc["split"],
            "category": doc["category"],
            "L2-category": doc["l2-category"],
        },
        "submission": {
            "index": doc["index"],
            "question": doc["question"],
            "answer": doc["answer"],
            "prediction": model_response,
            "hint": doc["hint"],
            "source": doc["source"],
            "split": doc["split"],
            "category": doc["category"],
            "L2-category": doc["l2-category"],
        },
    }
    option_candidate = ["A", "B", "C", "D", "E"]
    for c in option_candidate:
        data["submission"][c] = doc.get(c, "nan")
        data["gpt_eval_score"][c] = doc.get(c, "nan")
    return data


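# Dev split: score predictions with the GPT judge, dump the accuracy breakdown to a
# JSON submission file, and return the overall accuracy as a percentage.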
def mmbench_aggregate_dev_results_eval(results, args):
    print(f"============= MMBench-RU(Dev) Detailed Results =============")
    overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
    file = generate_submission_file("mmbench_ru_dev_results.json", args)
    details_info = {
        "overall_acc": overall_acc,
        "category_acc": category_acc,
        "l2_category_acc": l2_category_acc,
    }
    with open(file, "w") as f:
        json.dump(details_info, f)
    return overall_acc * 100


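# Dev/test submission files: collect the per-doc "submission" records into an .xlsx
# suitable for uploading to the MMBench evaluation server.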
def mmbench_aggregate_dev_results_submission(results, args):
    df = pd.DataFrame(results)
    excel_write_path = generate_submission_file("mmbench_ru_dev_results.xlsx", args)
    with pd.ExcelWriter(excel_write_path) as writer:
        df.to_excel(writer, index=False)
    eval_logger.info(f"Saved results to {excel_write_path}")


def mmbench_aggregate_test_results(results, args):
    df = pd.DataFrame(results)
    excel_write_path = generate_submission_file("mmbench_ru_test_results.xlsx", args)
    with pd.ExcelWriter(excel_write_path) as writer:
        df.to_excel(writer, index=False)
    eval_logger.info(f"Saved results to {excel_write_path}")
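
For reference, a minimal sketch of the prompt assembly that mmbench_doc_to_text performs, using a hypothetical toy document in place of a real MMBench-RU row and rendering the options block inline instead of through MMBench_Evaluator.create_options_prompt (whose exact formatting is assumed here):

```python
import pandas as pd

# Hypothetical toy document; real docs come from the MMBench-RU dataset and also
# carry image, index, category, l2-category, source and split fields.
doc = {
    "hint": "nan",  # mmbench_doc_to_text treats the literal string "nan" as a missing hint
    "question": "Что изображено на картинке?",
    "A": "Собака",
    "B": "Кошка",
    "C": "Птица",
    "D": "Рыба",
}

option_candidate = ["A", "B", "C", "D", "E"]
# Assumed rendering of the options block; create_options_prompt may format it differently.
options_prompt = "\n".join(f"{c}. {doc[c]}" for c in option_candidate if c in doc and pd.notna(doc[c]))

# Same hint handling as mmbench_doc_to_text: prepend the hint only when it is
# neither NaN nor the string "nan".
if pd.notna(doc["hint"]) and doc["hint"] != "nan":
    query_prompt = f"{doc['hint']} {doc['question']} {options_prompt}"
else:
    query_prompt = f"{doc['question']} {options_prompt}"

print(query_prompt)
```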