Create ScreenSpot on clean branch · EvolvingLMMs-Lab/lmms-eval@e457cfb

import logging

from PIL import ImageDraw
from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocotools.coco import COCO

# Full COCO caption metric suite; SPICE is disabled, and the second
# assignment narrows the active metrics to CIDEr only.
COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
COCO_METRICS = ["CIDEr"]

eval_logger = logging.getLogger("lmms-eval")


def screenspot_bbox_doc_to_visual(doc):
    # Draw the ground-truth bounding box onto the screenshot so the model
    # can be prompted about the highlighted region.
    bbox = doc["bbox"]
    image = doc["image"].convert("RGB")
    draw = ImageDraw.Draw(image)
    bbox_xy = [bbox[0], bbox[1], bbox[2], bbox[3]]
    draw.rectangle(bbox_xy, outline="red", width=3)
    return [image]  # image is already RGB
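

# Usage sketch (illustrative only; the doc below is synthetic, not from the
# ScreenSpot dataset):
#
#   from PIL import Image
#   demo_doc = {"image": Image.new("RGB", (200, 100), "white"), "bbox": [10, 10, 80, 40]}
#   (framed,) = screenspot_bbox_doc_to_visual(demo_doc)
#   framed.save("bbox_demo.png")  # the region is outlined in red, 3 px wide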


def screenspot_process_result(doc, result):
    """
    Args:
        doc: an instance of the eval dataset
        result: [pred]
    Returns:
        a dictionary keyed by metric name, whose value is the per-doc payload
        consumed by the aggregation functions below
    """
    pred = result[0] if len(result) > 0 else ""
    ann_id = doc["file_name"]
    data_dict = {"instruction": doc["instruction"], "pred": pred, "ann_id": ann_id, "data_type": doc["data_type"], "data_source": doc["data_source"]}
    return {f"screenspot_{metric}": data_dict for metric in COCO_METRICS}
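

# Example (synthetic values): with COCO_METRICS == ["CIDEr"], a call such as
#
#   screenspot_process_result(doc, ["click the search icon"])
#
# returns one entry per active metric, each holding the same payload:
#
#   {"screenspot_CIDEr": {"instruction": doc["instruction"],
#                         "pred": "click the search icon",
#                         "ann_id": doc["file_name"],
#                         "data_type": doc["data_type"],
#                         "data_source": doc["data_source"]}}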


def screenspot_doc_to_text(doc):
    return f"Direct a user to interact with the highlighted region [{doc['bbox'][0]:.2f}, {doc['bbox'][1]:.2f}, {doc['bbox'][2]:.2f}, {doc['bbox'][3]:.2f}]."
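

# For example, a doc with bbox [0.1, 0.2, 0.35, 0.4] (whether coordinates are
# normalized or in pixels depends on the dataset) produces the prompt:
#
#   "Direct a user to interact with the highlighted region [0.10, 0.20, 0.35, 0.40]."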


def screenspot_aggregation_result(results, metric):
    # Full scorer suite; as with COCO_METRICS above, the second assignment
    # narrows this to CIDEr only.
    scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")]
    scorers = [(Cider(), "CIDEr")]
    scorers_dict = {s[1]: s for s in scorers}

    stored_results = []
    # For the coco eval tools to successfully create an index, the dataset
    # needs at least two keys: 'annotations' and 'images'.
    # 'annotations' reproduces the original annotations exactly, while
    # 'images' only needs the image ids, which are contained in the file names.
    dataset = {"annotations": [], "images": []}
    idx = 0
    ann_id = 0
    for result in results:
        stored_results.append({"image_id": idx, "caption": result["pred"]})
        # Each result carries a single ground-truth caption: the instruction.
        dataset["annotations"].append({"image_id": idx, "caption": result["instruction"], "id": ann_id})
        ann_id += 1
        dataset["images"].append({"id": idx})
        idx += 1
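
    # Illustration (synthetic values): for two results with instructions
    # "open settings" and "tap back", the structures built above would be
    #
    #   stored_results = [{"image_id": 0, "caption": <pred 0>},
    #                     {"image_id": 1, "caption": <pred 1>}]
    #   dataset = {"annotations": [{"image_id": 0, "caption": "open settings", "id": 0},
    #                              {"image_id": 1, "caption": "tap back", "id": 1}],
    #              "images": [{"id": 0}, {"id": 1}]}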

    coco = COCO()
    # Manually create the index from the in-memory dataset.
    coco.dataset = dataset
    coco.createIndex()

    coco_result = coco.loadRes(stored_results)
    coco_eval = COCOEvalCap(coco, coco_result)

    imgIds = coco_eval.params["image_id"]
    gts = {}
    res = {}
    for imgId in imgIds:
        gts[imgId] = coco_eval.coco.imgToAnns[imgId]
        res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]

    eval_logger.info("tokenization...")
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    eval_logger.info(f"Computing {metric} scores...")

    score, scores = scorers_dict[metric][0].compute_score(gts, res)
    coco_eval.setEval(score, metric)

    # When the metric is one of the Bleu variants, score is a list of the
    # Bleu_1..Bleu_4 values; pick the one that was requested.
    if isinstance(score, list):
        n = int(metric.split("_")[-1])
        score = score[n - 1]

    return score


def screenspot_bleu4(results):
    return screenspot_aggregation_result(results, "Bleu_4")


def screenspot_bleu3(results):
    return screenspot_aggregation_result(results, "Bleu_3")


def screenspot_bleu2(results):
    return screenspot_aggregation_result(results, "Bleu_2")


def screenspot_bleu1(results):
    return screenspot_aggregation_result(results, "Bleu_1")


def screenspot_meteor(results):
    return screenspot_aggregation_result(results, "METEOR")


def screenspot_rougel(results):
    return screenspot_aggregation_result(results, "ROUGE_L")


def screenspot_cider(results):
    return screenspot_aggregation_result(results, "CIDEr")


def screenspot_spice(results):
    return screenspot_aggregation_result(results, "SPICE")
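

# A minimal end-to-end sketch (not part of the commit): the results below are
# synthetic stand-ins for what screenspot_process_result emits, and running it
# requires pycocoevalcap/pycocotools plus Java for the PTBTokenizer.
if __name__ == "__main__":
    fake_results = [
        {"instruction": "open the settings menu", "pred": "open the settings menu", "ann_id": "a.png", "data_type": "text", "data_source": "ios"},
        {"instruction": "tap the back arrow", "pred": "tap back", "ann_id": "b.png", "data_type": "icon", "data_source": "web"},
    ]
    print("CIDEr:", screenspot_cider(fake_results))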