Add Muirbench (#143) · EvolvingLMMs-Lab/lmms-eval@5fc5f2f (original) (raw)

1

+

2

from lmms_eval.filters.extraction import ExtendedRegexFilter

3

from lmms_eval.filters.transformation import MapFilter

4

import re

5

import pandas as pd

6

+

7

+

8

def muir_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Build the multiple-choice prompt text for one document.

    The question is followed by one line per option, each labelled with a
    consecutive capital letter starting at "A", and the result is wrapped in
    the optional pre/post prompts.

    Args:
        doc: Mapping with a "question" entry and an "options" sequence.
        model_specific_prompt_kwargs: Optional mapping that may provide
            "pre_prompt" and "post_prompt" strings. ``None`` (the declared
            default) or a missing key is treated as an empty string instead
            of raising.

    Returns:
        The formatted prompt string.
    """
    # The signature advertises None as the default, so it must be usable.
    prompt_kwargs = model_specific_prompt_kwargs or {}
    pre_prompt = prompt_kwargs.get("pre_prompt", "")
    post_prompt = prompt_kwargs.get("post_prompt", "")

    question, choices = doc["question"], doc["options"]
    first_letter = ord("A")
    choices_str = "\n".join(
        f"{chr(first_letter + index)}. {choice}"
        for index, choice in enumerate(choices)
    )
    return f"{pre_prompt}{question}\n{choices_str}{post_prompt}"

16

+

17

+

18

def muir_doc_to_visual(doc):
    """Return the document's images converted to RGB.

    Args:
        doc: Mapping whose "image_list" entry is an iterable of objects
            exposing ``convert(mode)`` (presumably PIL images -- confirm
            against the dataset loader).

    Returns:
        A new list holding ``image.convert("RGB")`` for every image, in the
        original order.
    """
    return [image.convert("RGB") for image in doc["image_list"]]

21

+

22

+

23

def muir_doc_to_target(doc):
    """Return the reference answer stored under the document's "answer" key.

    Args:
        doc: Mapping that contains an "answer" entry.

    Returns:
        The value of ``doc["answer"]``, unchanged.
    """
    return doc["answer"]

25

+

26

+

27

def muir_process_results(doc, result):
    """Bundle one prediction with the document fields used for scoring.

    Args:
        doc: Mapping providing "task", "idx", "image_relation", "answer" and
            "image_type".
        result: Sequence whose first element is taken as the prediction.

    Returns:
        A dict with the single key "muirbench_score_overall" whose value is a
        record holding "pred" plus the five document fields listed above.
    """
    record = {"pred": result[0]}
    # Copy the metadata fields verbatim; a missing field raises KeyError.
    for field in ("task", "idx", "image_relation", "answer", "image_type"):
        record[field] = doc[field]
    return {"muirbench_score_overall": record}

45

+

46

+

47

def muir_aggregation(results):
    """Compute overall accuracy and print a per-task accuracy table.

    A record counts as correct when its "pred" equals its "answer" after both
    are lower-cased and stripped of surrounding whitespace.

    Args:
        results: Sized collection of records, each with string "pred" and
            "answer" entries and a hashable "task" entry.

    Returns:
        The fraction of correct records, or 0.0 when ``results`` is empty.
    """
    if not results:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    task_total = {}
    task_correct = {}
    correct = 0
    for record in results:
        task = record["task"]
        task_total[task] = task_total.get(task, 0) + 1
        # Ensure every seen task gets a row even if it has no correct answers.
        task_correct.setdefault(task, 0)
        if record["pred"].lower().strip() == record["answer"].lower().strip():
            task_correct[task] += 1
            correct += 1

    task_score = {
        task: hits / task_total[task] for task, hits in task_correct.items()
    }

    print("=" * 50)
    for task, accuracy in task_score.items():
        print(f"{task} : {accuracy:.2f}")
    print("=" * 50)

    return correct / len(results)

73

+

74

+

75

+

76

+

77

class MultiChoiceRegexFilter(ExtendedRegexFilter):
    """Filter that reduces each response set to one extracted option letter."""

    def __init__(self, *args, **kwargs):
        r"""Forward all arguments unchanged to ``ExtendedRegexFilter``.

        The option descriptions below are carried over from the original
        docstring; ``apply`` in this class does not read any of them.

        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
        group_select: Selects the (group_select)th match from the findall result.
        ignore_case: Ignores the case during step 1 matching
        ignore_punctuation: Remove the punctuation during step 1 matching
        regexes_to_ignore: Remove these regexes during step 1 matching
        """
        super().__init__(*args, **kwargs)

    def apply(self, resps, docs):
        """Extract a leading option letter from the first response of each set.

        Args:
            resps: Iterable of response lists, one list per input/target pair.
            docs: Iterable paired with ``resps`` via ``zip``; its items are
                not inspected here.

        Returns:
            A list with one entry per (response list, doc) pair: the captured
            capital letter when the first response matches the pattern,
            otherwise the first response unchanged.
        """
        # here, we assume we have a list, in which each element is
        # a list of model responses for some particular input/target pair.
        # so we process each of these (same input/target response sets)
        # independently (and keep them a list.)

        # Regex to directly extract the option letter from the model response.
        # Compiled once because it does not depend on the loop variables.
        # NOTE(review): the trailing "." is unescaped, so it matches any
        # character after the letter -- confirm whether a literal period
        # was intended.
        option_letter_regex = re.compile(r"^\s*([A-Z]).")

        filtered_resps = []
        for response_set, _doc in zip(resps, docs):
            filtered = []
            for resp in response_set:
                # Try to match the option letter at the start of the response
                match = option_letter_regex.match(resp)
                # Keep the matched letter, or the original response if none
                filtered.append(match.group(1) if match else resp)

            # Only the first processed response of each set is kept.
            filtered_resps.append(filtered[0])

        return filtered_resps