Create ScreenSpot on clean branch · EvolvingLMMs-Lab/lmms-eval@e457cfb

```python
from PIL import ImageDraw
from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocotools.coco import COCO
import logging

COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
# The reassignment below overrides the full list above: only CIDEr is reported.
COCO_METRICS = ["CIDEr"]

eval_logger = logging.getLogger("lmms-eval")


def screenspot_bbox_doc_to_visual(doc):
    # Draw the target bounding box onto the screenshot so the model sees the highlighted region.
    bbox = doc["bbox"]
    image = doc["image"].convert("RGB")
    draw = ImageDraw.Draw(image)
    bbox_xy = [bbox[0], bbox[1], bbox[2], bbox[3]]
    draw.rectangle(bbox_xy, outline="red", width=3)
    return [image.convert("RGB")]
```
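As a quick sanity check of the visual preprocessing above, here is a minimal usage sketch. It builds a hypothetical doc with a blank PIL image and a bbox in pixel xyxy coordinates (which is how the drawing code interprets `bbox`), and assumes `screenspot_bbox_doc_to_visual` from the module above is in scope.

```python
from PIL import Image

# Hypothetical ScreenSpot-style record: a PIL image plus an [x0, y0, x1, y1] bbox.
doc = {
    "image": Image.new("RGB", (400, 300), "white"),
    "bbox": [50, 40, 180, 120],
}

visuals = screenspot_bbox_doc_to_visual(doc)  # assumed in scope from the module above
print(type(visuals[0]), visuals[0].size)      # <class 'PIL.Image.Image'> (400, 300)
```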

```python
def screenspot_process_result(doc, result):
    """
    Args:
        doc: an instance of the eval dataset
        result: [pred]
    Returns:
        a dictionary keyed by metric name (e.g. "screenspot_CIDEr"), each holding the
        per-sample fields needed later for aggregation
    """
    pred = result[0] if len(result) > 0 else ""
    ann_id = doc["file_name"]
    data_dict = {"instruction": doc["instruction"], "pred": pred, "ann_id": ann_id, "data_type": doc["data_type"], "data_source": doc["data_source"]}
    return {f"screenspot_{metric}": data_dict for metric in COCO_METRICS}


def screenspot_doc_to_text(doc):
    return f"Direct a user to interact with the highlighted region [{doc['bbox'][0]:.2f}, {doc['bbox'][1]:.2f}, {doc['bbox'][2]:.2f}, {doc['bbox'][3]:.2f}]."
```
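A minimal sketch of the per-sample flow, assuming the two functions above are in scope and using a hypothetical record; it shows the prompt produced by `screenspot_doc_to_text` and the keys `screenspot_process_result` emits (only `screenspot_CIDEr`, since `COCO_METRICS` is narrowed to CIDEr).

```python
doc = {
    "file_name": "web_001.png",               # hypothetical values for illustration
    "instruction": "click the search button",
    "bbox": [0.10, 0.20, 0.35, 0.30],
    "data_type": "text",
    "data_source": "web",
}

prompt = screenspot_doc_to_text(doc)
# "Direct a user to interact with the highlighted region [0.10, 0.20, 0.35, 0.30]."

model_output = ["press the search button at the top"]  # the model's generated instruction
per_sample = screenspot_process_result(doc, model_output)
print(per_sample.keys())  # dict_keys(['screenspot_CIDEr'])
```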

```python
def screenspot_aggregation_result(results, metric):
    scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")]
    # The reassignment below overrides the full scorer list above: only CIDEr is computed.
    scorers = [(Cider(), "CIDEr")]
    scorers_dict = {s[1]: s for s in scorers}

    stored_results = []
    # In order for the COCO eval tools to successfully create an index,
    # the dataset needs at least two dicts: 'annotations' and 'images'.
    # 'annotations' reproduces the original annotations (here, the instructions),
    # while 'images' only needs the image id, which comes from the file name.
    dataset = {"annotations": [], "images": []}
    idx = 0
    ann_id = 0
    for result in results:
        stored_results.append({"image_id": idx, "caption": result["pred"]})
        # Each sample contributes one ground-truth caption: its instruction.
        dataset["annotations"].append({"image_id": idx, "caption": result["instruction"], "id": ann_id})
        ann_id += 1
        dataset["images"].append({"id": idx})
        idx += 1

    coco = COCO()
    # Manually create the index here.
    coco.dataset = dataset
    coco.createIndex()

    coco_result = coco.loadRes(stored_results)
    coco_eval = COCOEvalCap(coco, coco_result)

    imgIds = coco_eval.params["image_id"]
    gts = {}
    res = {}
    for imgId in imgIds:
        gts[imgId] = coco_eval.coco.imgToAnns[imgId]
        res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]

    eval_logger.info("tokenization...")
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    eval_logger.info(f"Computing {metric} scores...")

    score, scores = scorers_dict[metric][0].compute_score(gts, res)
    coco_eval.setEval(score, metric)

    # When the metric is one of the Bleu variants, compute_score returns a list [Bleu_1, ..., Bleu_4].
    if isinstance(score, list):
        n = int(metric.split("_")[-1])
        score = score[n - 1]

    return score
```
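To see the aggregation end to end, here is a rough sketch that feeds two toy per-sample dicts (the shape produced by `screenspot_process_result`) through the CIDEr path. This assumes `pycocoevalcap` and `pycocotools` are installed and a Java runtime is available, since the full scorer list eagerly constructs Meteor and the PTB tokenizer shells out to Java; with only two samples the absolute CIDEr value is not meaningful, the point is just the plumbing.

```python
# Two toy per-sample dicts in the shape produced by screenspot_process_result.
toy_results = [
    {"instruction": "click the search button", "pred": "click the search button",
     "ann_id": "web_001.png", "data_type": "text", "data_source": "web"},
    {"instruction": "open the settings menu", "pred": "tap the gear icon",
     "ann_id": "mobile_002.png", "data_type": "icon", "data_source": "mobile"},
]

# Same call the screenspot_cider wrapper defined below makes; only "pred" and "instruction" are used.
cider = screenspot_aggregation_result(toy_results, "CIDEr")
print(f"CIDEr on the toy set: {cider:.3f}")
```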

```python
def screenspot_bleu4(results):
    return screenspot_aggregation_result(results, "Bleu_4")


def screenspot_bleu3(results):
    return screenspot_aggregation_result(results, "Bleu_3")


def screenspot_bleu2(results):
    return screenspot_aggregation_result(results, "Bleu_2")


def screenspot_bleu1(results):
    return screenspot_aggregation_result(results, "Bleu_1")


def screenspot_meteor(results):
    return screenspot_aggregation_result(results, "METEOR")


def screenspot_rougel(results):
    return screenspot_aggregation_result(results, "ROUGE_L")


def screenspot_cider(results):
    return screenspot_aggregation_result(results, "CIDEr")


def screenspot_spice(results):
    return screenspot_aggregation_result(results, "SPICE")
```