Add Muirbench (#143) · EvolvingLMMs-Lab/lmms-eval@5fc5f2f (original) (raw)

``

1

+

``

2

`+

from lmms_eval.filters.extraction import ExtendedRegexFilter

`

``

3

`+

from lmms_eval.filters.transformation import MapFilter

`

``

4

`+

import re

`

``

5

`+

import pandas as pd

`

``

6

+

``

7

+

``

8

`+

def muir_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Build the multiple-choice prompt text for one document.

    Args:
        doc: Mapping providing "question" (the question text) and "options"
            (the answer choices, in order).
        model_specific_prompt_kwargs: Optional mapping with "pre_prompt" and
            "post_prompt" strings placed before the question and after the
            choices. A missing mapping or a missing key is treated as an
            empty string.

    Returns:
        The prompt string: pre_prompt, the question, one "<letter>. <choice>"
        line per option (lettered from "A"), then post_prompt.
    """
    # The parameter defaults to None, so it must not be subscripted directly.
    prompt_kwargs = model_specific_prompt_kwargs or {}
    pre_prompt = prompt_kwargs.get("pre_prompt", "")
    post_prompt = prompt_kwargs.get("post_prompt", "")

    question, choices = doc["question"], doc["options"]
    choices_str = "\n".join(
        f"{chr(ord('A') + index)}. {choice}" for index, choice in enumerate(choices)
    )
    return f"{pre_prompt}{question}\n{choices_str}{post_prompt}"

`

``

16

+

``

17

+

``

18

`+

def muir_doc_to_visual(doc):
    """Return the document's images, each converted to RGB mode.

    Args:
        doc: Mapping whose "image_list" entry is an iterable of objects
            exposing ``convert(mode)`` (presumably PIL images -- confirm
            against the dataset loader).

    Returns:
        A new list holding ``image.convert("RGB")`` for every entry, in the
        original order.
    """
    rgb_images = []
    for picture in doc["image_list"]:
        rgb_images.append(picture.convert("RGB"))
    return rgb_images

`

``

21

+

``

22

+

``

23

`+

def muir_doc_to_target(doc):
    """Return the reference answer stored under the document's "answer" key."""
    target = doc["answer"]
    return target

`

``

25

+

``

26

+

``

27

`+

def muir_process_results(doc, result):
    """Package one prediction with the document fields used for scoring.

    Args:
        doc: Mapping providing "task", "idx", "image_relation", "answer" and
            "image_type".
        result: Sequence of model outputs; only the first element is used.

    Returns:
        A dict with the single key "muirbench_score_overall" whose value is
        a record holding the prediction plus the copied document fields.
    """
    # Document fields carried over unchanged, in the order they are stored.
    copied_fields = ("task", "idx", "image_relation", "answer", "image_type")

    record = {"pred": result[0]}
    for field in copied_fields:
        record[field] = doc[field]

    return {"muirbench_score_overall": record}

`

``

45

+

``

46

+

``

47

`+

def muir_aggregation(results):
    """Compute overall accuracy and print a per-task accuracy table.

    A prediction counts as correct when it equals the answer after both are
    lower-cased and stripped of surrounding whitespace.

    Args:
        results: Sequence of records, each providing string values under
            "pred" and "answer" and a hashable value under "task".

    Returns:
        The fraction of records whose prediction is correct, or 0.0 when
        ``results`` is empty.
    """
    if not results:
        # Nothing to score; avoids dividing by zero below.
        return 0.0

    task_num = {}
    task_score = {}
    correct = 0
    for result in results:
        task = result["task"]
        is_correct = result["pred"].lower().strip() == result["answer"].lower().strip()
        task_num[task] = task_num.get(task, 0) + 1
        task_score[task] = task_score.get(task, 0) + int(is_correct)
        correct += int(is_correct)

    task_accuracy = {task: hits / task_num[task] for task, hits in task_score.items()}

    print("=" * 50)
    for task, accuracy in task_accuracy.items():
        print(f"{task} : {accuracy:.2f}")
    print("=" * 50)

    return correct / len(results)

`

``

73

+

``

74

+

``

75

+

``

76

+

``

77

`+

class MultiChoiceRegexFilter(ExtendedRegexFilter):
    """Filter that reduces a model response to its leading option letter."""

    # Matches an upper-case option letter followed by a literal dot at the
    # start of the response, e.g. "B. the red car". The dot is escaped so that
    # ordinary sentences such as "The answer is B" are not cut down to "T".
    # Compiled once here instead of on every loop iteration.
    _OPTION_LETTER_REGEX = re.compile(r"^\s*([A-Z])\.")

    def __init__(self, *args, **kwargs):
        """
        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
        group_select: Selects the (group_select)th match from the findall result.
        ignore_case: Ignores the case during step 1 matching
        ignore_punctuation: Remove the punctuation during step 1 matching
        regexes_to_ignore: Remove these regexes during step 1 matching

        All arguments are forwarded unchanged to the parent constructor.
        """
        super().__init__(*args, **kwargs)

    def apply(self, resps, docs):
        """Extract one answer per document from its list of responses.

        Args:
            resps: List in which each element is the list of model responses
                for one input/target pair.
            docs: Documents paired positionally with ``resps``; iteration
                stops at the shorter of the two.

        Returns:
            A list with one entry per pair: the option letter when the first
            response starts with "<letter>.", otherwise the first response
            unchanged.
        """
        # here, we assume we have a list, in which each element is
        # a list of model responses for some particular input/target pair.
        # so we process each of these (same input/target response sets)
        # independently (and keep them a list.)
        filtered_resps = []

        for r, _doc in zip(resps, docs):
            # Process each response
            filtered = []
            for resp in r:
                # Try to match the option letter at the start of the response
                match = self._OPTION_LETTER_REGEX.match(resp)
                if match:
                    # If a match is found, append the matched letter
                    filtered.append(match.group(1))
                else:
                    # If no match, return the original response
                    filtered.append(resp)

            # Assuming we need the first response that matches or the original response
            filtered_resps.append(filtered[0])

        return filtered_resps

`