[WIP] adding mmbench dev evaluation (#75) · EvolvingLMMs-Lab/lmms-eval@a19278c

```diff
@@ -9,7 +9,7 @@
 from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
 from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 
-with open(Path(__file__).parent / "mmbench_en.yaml", "r") as f:
+with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
     raw_data = f.readlines()
     safe_data = []
     for i, line in enumerate(raw_data):
```

```diff
@@ -19,7 +19,18 @@
 
 config = yaml.safe_load("".join(safe_data))
 
-mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"])
+GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
+API_TYPE = os.getenv("API_TYPE", "openai")
+
+if API_TYPE == "openai":
+    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+elif API_TYPE == "azure":
+    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+
+
+mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)
 
 
 def mmbench_doc_to_visual(doc):
```
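The hunk above resolves the judge-model credentials once at import time, so the environment variables have to be exported before lmms-eval loads the task module. A minimal sketch of that setup (variable names are taken from the diff; the key value is a placeholder, not a real credential):

```python
# Minimal sketch, not part of the PR: export credentials before the task
# module is imported, since API_URL / API_KEY are read at import time.
import os

os.environ["API_TYPE"] = "openai"        # or "azure"
os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder key
# Optional override; otherwise the default chat-completions endpoint is used.
os.environ.setdefault("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
```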

```diff
@@ -54,7 +65,18 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
 
 def mmbench_process_results(doc, results):
     model_response = results[0].strip()
-    return {
+    data = {
+        "gpt_eval_score": {
+            "index": doc["index"],
+            "question": doc["question"],
+            "answer": doc["answer"],
+            "prediction": model_response,
+            "hint": doc["hint"],
+            "source": doc["source"],
+            "split": doc["split"],
+            "category": doc["category"],
+            "L2-category": doc["L2-category"],
+        },
         "submission": {
             "index": doc["index"],
             "question": doc["question"],
```

```diff
@@ -65,11 +87,30 @@ def mmbench_process_results(doc, results):
             "split": doc["split"],
             "category": doc["category"],
             "L2-category": doc["L2-category"],
-        }
+        },
+    }
+    option_candidate = ["A", "B", "C", "D", "E"]
+    for c in option_candidate:
+        data["submission"][c] = doc.get(c, "nan")
+        data["gpt_eval_score"][c] = doc.get(c, "nan")
+    return data
+
+
+def mmbench_aggregate_dev_results_eval(results, args):
+    print(f"============= MMBench-EN(Dev) Detailed Results =============")
+    overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
+    file = generate_submission_file("mmbench_en_dev_results.json", args)
+    details_info = {
+        "overall_acc": overall_acc,
+        "category_acc": category_acc,
+        "l2_category_acc": l2_category_acc,
     }
+    with open(file, "w") as f:
+        json.dump(details_info, f)
+    return overall_acc * 100
 
 
-def mmbench_aggregate_dev_results(results, args):
+def mmbench_aggregate_dev_results_submission(results, args):
     df = pd.DataFrame(results)
     excel_write_path = generate_submission_file("mmbench_en_dev_results.xlsx", args)
     with pd.ExcelWriter(excel_write_path) as writer:
```
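After this change, `mmbench_process_results` emits two parallel per-sample payloads: `gpt_eval_score` (consumed by the new GPT-judged `mmbench_aggregate_dev_results_eval`, which reports `overall_acc * 100`) and `submission` (written to the xlsx by `mmbench_aggregate_dev_results_submission`), with the A–E option columns copied from the doc and missing options falling back to `"nan"`. A rough, self-contained sketch of that record shape, using a made-up doc; values are placeholders and the exact submission fields may differ slightly:

```python
# Rough sketch of the per-sample record shape (placeholder values; not the
# actual module code).
doc = {
    "index": 0,
    "question": "What color is the car?",
    "answer": "A",
    "hint": "nan",
    "source": "demo",
    "split": "dev",
    "category": "color",
    "L2-category": "attribute recognition",
    "A": "red",
    "B": "blue",
    # C/D/E intentionally absent -> doc.get(c, "nan") fills "nan"
}
model_response = "A"

shared = {k: doc[k] for k in ("index", "question", "answer", "hint", "source", "split", "category", "L2-category")}
shared["prediction"] = model_response
data = {"gpt_eval_score": dict(shared), "submission": dict(shared)}
for c in ["A", "B", "C", "D", "E"]:
    data["gpt_eval_score"][c] = doc.get(c, "nan")
    data["submission"][c] = doc.get(c, "nan")

# data["gpt_eval_score"] feeds the GPT-based dev accuracy aggregation;
# data["submission"] feeds the xlsx submission file.
```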