Merge pull request #129 from Dannoopsy/mmbench_ru · EvolvingLMMs-Lab/lmms-eval@39d40de

The pull request adds the MMBench-RU task utilities: YAML config loading, judge-API setup, prompt construction, and result processing/aggregation.

```python
import yaml
import os
from pathlib import Path
import pandas as pd
import json

from loguru import logger as eval_logger
from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data.append(line)

    config = yaml.safe_load("".join(safe_data))

GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
API_TYPE = os.getenv("API_TYPE", "openai")

# Judge API endpoint and key, selected via the API_TYPE environment variable.
if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
else:
    API_URL = "YOUR_API_URL"
    API_KEY = "YOUR_API_KEY"


mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)
```
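The config block above drops `!function` lines before parsing because `yaml.safe_load` has no constructor for that custom tag. Below is a minimal, self-contained sketch of that behaviour; the YAML content (model name, system prompt, `!function` entry) is invented for the example and is not taken from the PR.

```python
import yaml

# Illustrative config text only, not the actual task YAML shipped with the PR.
raw_text = """
metadata:
  gpt_eval_model_name: gpt-3.5-turbo
  sys_prompt: "There are several options:"
process_results: !function utils.mmbench_process_results
"""

try:
    yaml.safe_load(raw_text)
except yaml.YAMLError as err:
    # safe_load rejects the custom tag with a ConstructorError
    print(f"safe_load failed: {type(err).__name__}")

# Dropping the `!function` line first, as the task code does, makes the rest parseable.
safe_text = "\n".join(line for line in raw_text.splitlines() if "!function" not in line)
config = yaml.safe_load(safe_text)
print(config["metadata"]["gpt_eval_model_name"])  # gpt-3.5-turbo
```

The next hunk defines the helpers that turn a dataset record into model inputs: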

```python
def mmbench_doc_to_visual(doc):
    return [doc["image"].convert("RGB")]


def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
    option_candidate = ["A", "B", "C", "D", "E"]
    options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate)

    data = {
        "img": doc["image"],
        "question": doc["question"],
        "answer": doc.get("answer", None),
        "options": options_prompt,
        "category": doc["category"],
        "L2-category": doc["l2-category"],
        "options_dict": options_dict,
        "index": doc["index"],
        "hint": doc["hint"],
        "source": doc["source"],
        "split": doc["split"],
    }

    query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) and data["hint"] != "nan" else f"{data['question']} {data['options']}"

    if model_specific_prompt_kwargs:
        query_prompt = f"{query_prompt}\n{model_specific_prompt_kwargs['post_prompt']}"

    return query_prompt
```
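`mmbench_doc_to_text` prepends the hint only when one is actually present; the double check `pd.notna(...)` and `!= "nan"` handles missing hints that arrive either as a real NaN or as the literal string "nan". A small standalone illustration of that branch (the question, options, and hints are made up):

```python
import pandas as pd

# Made-up values: a real hint, a NaN hint, and a string "nan" hint.
question, options = "Какого цвета объект?", "A. красный B. синий"
for hint in ["Посмотрите на верхний левый угол.", float("nan"), "nan"]:
    if pd.notna(hint) and hint != "nan":
        prompt = f"{hint} {question} {options}"
    else:
        prompt = f"{question} {options}"
    print(prompt)
```

The remaining hunk covers result processing and aggregation: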

```python
def mmbench_process_results(doc, results):
    model_response = results[0].strip()
    data = {
        "gpt_eval_score": {
            "index": doc["index"],
            "question": doc["question"],
            "answer": doc["answer"],
            "prediction": model_response,
            "hint": doc["hint"],
            "source": doc["source"],
            "split": doc["split"],
            "category": doc["category"],
            "L2-category": doc["l2-category"],
        },
        "submission": {
            "index": doc["index"],
            "question": doc["question"],
            "answer": doc["answer"],
            "prediction": model_response,
            "hint": doc["hint"],
            "source": doc["source"],
            "split": doc["split"],
            "category": doc["category"],
            "L2-category": doc["l2-category"],
        },
    }
    option_candidate = ["A", "B", "C", "D", "E"]
    for c in option_candidate:
        data["submission"][c] = doc.get(c, "nan")
        data["gpt_eval_score"][c] = doc.get(c, "nan")
    return data


def mmbench_aggregate_dev_results_eval(results, args):
    print(f"============= MMBench-RU(Dev) Detailed Results =============")
    overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
    file = generate_submission_file("mmbench_ru_dev_results.json", args)
    details_info = {
        "overall_acc": overall_acc,
        "category_acc": category_acc,
        "l2_category_acc": l2_category_acc,
    }
    with open(file, "w") as f:
        json.dump(details_info, f)
    return overall_acc * 100


def mmbench_aggregate_dev_results_submission(results, args):
    df = pd.DataFrame(results)
    excel_write_path = generate_submission_file("mmbench_ru_dev_results.xlsx", args)
    with pd.ExcelWriter(excel_write_path) as writer:
        df.to_excel(writer, index=False)
    eval_logger.info(f"Saved results to {excel_write_path}")


def mmbench_aggregate_test_results(results, args):
    df = pd.DataFrame(results)
    excel_write_path = generate_submission_file("mmbench_ru_test_results.xlsx", args)
    with pd.ExcelWriter(excel_write_path) as writer:
        df.to_excel(writer, index=False)
    eval_logger.info(f"Saved results to {excel_write_path}")
```