Create ScreenSpot on clean branch · EvolvingLMMs-Lab/lmms-eval@e457cfb

import logging

from PIL import ImageDraw
from pycocoevalcap.eval import COCOEvalCap, Bleu, Meteor, Rouge, Cider, Spice
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocotools.coco import COCO

# Full COCO caption metric suite; SPICE is disabled, and the second
# assignment narrows the active metrics to CIDEr only.
COCO_METRICS = ["Bleu_4", "Bleu_3", "Bleu_2", "Bleu_1", "METEOR", "ROUGE_L", "CIDEr"]  # , "SPICE"]
COCO_METRICS = ["CIDEr"]

eval_logger = logging.getLogger("lmms-eval")


def screenspot_bbox_doc_to_visual(doc):
    # Draw the ground-truth bounding box onto the screenshot so the model
    # can be prompted about the highlighted region.
    bbox = doc["bbox"]
    image = doc["image"].convert("RGB")
    draw = ImageDraw.Draw(image)
    bbox_xy = [bbox[0], bbox[1], bbox[2], bbox[3]]
    draw.rectangle(bbox_xy, outline="red", width=3)
    return [image]  # image is already RGB
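

# Usage sketch (illustrative only; the doc below is synthetic, not from the
# ScreenSpot dataset):
#
#   from PIL import Image
#   demo_doc = {"image": Image.new("RGB", (200, 100), "white"), "bbox": [10, 10, 80, 40]}
#   (framed,) = screenspot_bbox_doc_to_visual(demo_doc)
#   framed.save("bbox_demo.png")  # the region is outlined in red, 3 px wide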


def screenspot_process_result(doc, result):
    """
    Args:
        doc: an instance of the eval dataset
        result: [pred]
    Returns:
        a dictionary keyed by metric name, whose value is the per-doc payload
        consumed by the aggregation functions below
    """
    pred = result[0] if len(result) > 0 else ""
    ann_id = doc["file_name"]
    data_dict = {"instruction": doc["instruction"], "pred": pred, "ann_id": ann_id, "data_type": doc["data_type"], "data_source": doc["data_source"]}
    return {f"screenspot_{metric}": data_dict for metric in COCO_METRICS}
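

# Example (synthetic values): with COCO_METRICS == ["CIDEr"], a call such as
#
#   screenspot_process_result(doc, ["click the search icon"])
#
# returns one entry per active metric, each holding the same payload:
#
#   {"screenspot_CIDEr": {"instruction": doc["instruction"],
#                         "pred": "click the search icon",
#                         "ann_id": doc["file_name"],
#                         "data_type": doc["data_type"],
#                         "data_source": doc["data_source"]}}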


def screenspot_doc_to_text(doc):
    return f"Direct a user to interact with the highlighted region [{doc['bbox'][0]:.2f}, {doc['bbox'][1]:.2f}, {doc['bbox'][2]:.2f}, {doc['bbox'][3]:.2f}]."
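

# For example, a doc with bbox [0.1, 0.2, 0.35, 0.4] (whether coordinates are
# normalized or in pixels depends on the dataset) produces the prompt:
#
#   "Direct a user to interact with the highlighted region [0.10, 0.20, 0.35, 0.40]."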


def screenspot_aggregation_result(results, metric):
    # Full scorer suite; as with COCO_METRICS above, the second assignment
    # narrows this to CIDEr only.
    scorers = [(Bleu(4), "Bleu_1"), (Bleu(4), "Bleu_2"), (Bleu(4), "Bleu_3"), (Bleu(4), "Bleu_4"), (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr"), (Spice(), "SPICE")]
    scorers = [(Cider(), "CIDEr")]
    scorers_dict = {s[1]: s for s in scorers}

    stored_results = []
    # For the coco eval tools to successfully create an index, the dataset
    # needs at least two keys: 'annotations' and 'images'.
    # 'annotations' reproduces the original annotations exactly, while
    # 'images' only needs the image ids, which are contained in the file names.
    dataset = {"annotations": [], "images": []}
    idx = 0
    ann_id = 0
    for result in results:
        stored_results.append({"image_id": idx, "caption": result["pred"]})
        # Each result carries a single ground-truth caption: the instruction.
        dataset["annotations"].append({"image_id": idx, "caption": result["instruction"], "id": ann_id})
        ann_id += 1
        dataset["images"].append({"id": idx})
        idx += 1
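
    # Illustration (synthetic values): for two results with instructions
    # "open settings" and "tap back", the structures built above would be
    #
    #   stored_results = [{"image_id": 0, "caption": <pred 0>},
    #                     {"image_id": 1, "caption": <pred 1>}]
    #   dataset = {"annotations": [{"image_id": 0, "caption": "open settings", "id": 0},
    #                              {"image_id": 1, "caption": "tap back", "id": 1}],
    #              "images": [{"id": 0}, {"id": 1}]}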

    coco = COCO()
    # Manually create the index from the in-memory dataset.
    coco.dataset = dataset
    coco.createIndex()

    coco_result = coco.loadRes(stored_results)
    coco_eval = COCOEvalCap(coco, coco_result)

    imgIds = coco_eval.params["image_id"]
    gts = {}
    res = {}
    for imgId in imgIds:
        gts[imgId] = coco_eval.coco.imgToAnns[imgId]
        res[imgId] = coco_eval.cocoRes.imgToAnns[imgId]

    eval_logger.info("tokenization...")
    tokenizer = PTBTokenizer()
    gts = tokenizer.tokenize(gts)
    res = tokenizer.tokenize(res)

    eval_logger.info(f"Computing {metric} scores...")

    score, scores = scorers_dict[metric][0].compute_score(gts, res)
    coco_eval.setEval(score, metric)

    # When the metric is one of the Bleu variants, score is a list of the
    # Bleu_1..Bleu_4 values; pick the one that was requested.
    if isinstance(score, list):
        n = int(metric.split("_")[-1])
        score = score[n - 1]

    return score


def screenspot_bleu4(results):
    return screenspot_aggregation_result(results, "Bleu_4")


def screenspot_bleu3(results):
    return screenspot_aggregation_result(results, "Bleu_3")


def screenspot_bleu2(results):
    return screenspot_aggregation_result(results, "Bleu_2")


def screenspot_bleu1(results):
    return screenspot_aggregation_result(results, "Bleu_1")


def screenspot_meteor(results):
    return screenspot_aggregation_result(results, "METEOR")


def screenspot_rougel(results):
    return screenspot_aggregation_result(results, "ROUGE_L")


def screenspot_cider(results):
    return screenspot_aggregation_result(results, "CIDEr")


def screenspot_spice(results):
    return screenspot_aggregation_result(results, "SPICE")
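

# A minimal end-to-end sketch (not part of the commit): the results below are
# synthetic stand-ins for what screenspot_process_result emits, and running it
# requires pycocoevalcap/pycocotools plus Java for the PTBTokenizer.
if __name__ == "__main__":
    fake_results = [
        {"instruction": "open the settings menu", "pred": "open the settings menu", "ann_id": "a.png", "data_type": "text", "data_source": "ios"},
        {"instruction": "tap the back arrow", "pred": "tap back", "ann_id": "b.png", "data_type": "icon", "data_source": "web"},
    ]
    print("CIDEr:", screenspot_cider(fake_results))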