modify the form of VCR · EvolvingLMMs-Lab/lmms-eval@e1f04db

@@ -1,9 +1,5 @@
-from collections import defaultdict
 import os
 from difflib import SequenceMatcher as SM
-import datetime
-import json
-from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 import evaluate
 import logging
 import spacy

@@ -34,6 +30,21 @@
 }
 
 
+def fast_filter(answer_text):
+    if "I can't" in answer_text:
+        return True
+    elif "I cannot" in answer_text:
+        return True
+    elif "sorry" in answer_text.lower():
+        return True
+    if "无法" in answer_text:
+        return True
+    elif "抱歉" in answer_text:
+        return True
+    else:
+        return False
+
+
 def vcr_doc_to_visual(doc):
     return [doc["stacked_image"].convert("RGB"), doc["only_it_image"].convert("RGB")]
 
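The new `fast_filter` helper treats refusal-style answers as failures before any metric is computed. A minimal sketch of the intended behaviour, assuming the function defined above; the example strings are invented for illustration:

```python
# Illustrative calls only; the answer strings below are made up, not from the dataset.
assert fast_filter("I can't identify the covered text.") is True
assert fast_filter("I cannot read the occluded region.") is True
assert fast_filter("Sorry, the image is unclear.") is True   # "sorry" is matched case-insensitively
assert fast_filter("抱歉，我看不清被遮挡的文字。") is True       # Chinese refusal phrase
assert fast_filter("The covered text is 'hello world'.") is False
```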

@@ -63,16 +74,29 @@ def tokenize(text, language):
     return [token.text for token in processed_text]
 
 
-def vcr_process_results_single(doc, result, language):
+def vcr_process_results_single(crossed_text, result, language):
     """
     Args:
         doc: a instance of the eval dataset
         results: [pred]
     Returns:
         a dictionary with key: metric name (in this case mme score), value: metric value
     """
+
     assert language in ["en", "zh"], f"Language {language} is not supported."
-    crossed_text = doc["crossed_text"]
+
+    if fast_filter(result):
+        return {
+            "crossed_text": crossed_text,
+            "max_sim_val": 0,
+            "max_sim_string": "",
+            "precision": 0,
+            "recall": 0,
+            "f1": 0,
+            "jaccard": 0,
+            "rouge1": 0,
+            "exact_match": 0,
+        }
     tokens_result = tokenize(result, language)
     tokens_crossed_text = tokenize(crossed_text, language)
 
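Note the signature change: `vcr_process_results_single` now receives the ground-truth string for a single blank (`crossed_text`) instead of the whole `doc`, and short-circuits to all-zero metrics when the prediction looks like a refusal. A hypothetical call, with made-up inputs:

```python
# Hypothetical inputs for illustration; relies on the functions defined in this file.
metrics = vcr_process_results_single(
    crossed_text="quick brown fox",
    result="I'm sorry, I can't see the covered text.",
    language="en",
)
# fast_filter() matches "sorry", so the function returns zeroed metrics:
# {"crossed_text": "quick brown fox", "max_sim_val": 0, "max_sim_string": "",
#  "precision": 0, "recall": 0, "f1": 0, "jaccard": 0, "rouge1": 0, "exact_match": 0}
```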

@@ -150,10 +174,26 @@ def vcr_en_process_results(doc, results):
         a dictionary with key: metric name (in this case mme score), value: metric value
     """
     assert len(results) == 2, f"Expected 2 results, got {len(results)}"
-    output = {
-        "res_stacked_image": vcr_process_results_single(doc, results[0], "en"),
-        "res_only_it_image": vcr_process_results_single(doc, results[1], "en"),
-    }
+    output = {}
+    for i in range(len(doc["crossed_text"])):
+        res_stacked_image_results = vcr_process_results_single(
+            doc["crossed_text"][i], results[0], "en"
+        )
+        res_only_image_results = vcr_process_results_single(
+            doc["crossed_text"][i], results[1], "en"
+        )
+        output.update(
+            {
+                f"res_stacked_image__{k}___{i}": v
+                for k, v in res_stacked_image_results.items()
+            }
+        )
+        output.update(
+            {
+                f"res_only_it_image__{k}___{i}": v
+                for k, v in res_only_image_results.items()
+            }
+        )
     return output
 
 
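Per-document results are now flattened into keys of the form `res_stacked_image__<metric>___<blank_index>` and `res_only_it_image__<metric>___<blank_index>`, one set per crossed-out span. A sketch of the resulting shape for a doc with two blanks (values invented):

```python
# Shape of the flattened per-document output when
# doc["crossed_text"] == ["first span", "second span"]; the scores are invented.
example_output = {
    "res_stacked_image__max_sim_val___0": 0.92,
    "res_stacked_image__f1___0": 0.88,
    "res_only_it_image__f1___0": 0.75,
    "res_stacked_image__f1___1": 0.40,
    "res_only_it_image__f1___1": 0.33,
    # ... plus crossed_text, max_sim_string, precision, recall, jaccard,
    # rouge1 and exact_match entries for each image setting and blank index.
}
```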

@@ -166,10 +206,26 @@ def vcr_zh_process_results(doc, results):
         a dictionary with key: metric name (in this case mme score), value: metric value
     """
     assert len(results) == 2, f"Expected 2 results, got {len(results)}"
-    output = {
-        "res_stacked_image": vcr_process_results_single(doc, results[0], "zh"),
-        "res_only_it_image": vcr_process_results_single(doc, results[1], "zh"),
-    }
+    output = {}
+    for i in range(len(doc["crossed_text"])):
+        res_stacked_image_results = vcr_process_results_single(
+            doc["crossed_text"][i], results[0], "zh"
+        )
+        res_only_image_results = vcr_process_results_single(
+            doc["crossed_text"][i], results[1], "zh"
+        )
+        output.update(
+            {
+                f"res_stacked_image__{k}___{i}": v
+                for k, v in res_stacked_image_results.items()
+            }
+        )
+        output.update(
+            {
+                f"res_only_it_image__{k}___{i}": v
+                for k, v in res_only_image_results.items()
+            }
+        )
     return output
 
 

@@ -180,36 +236,29 @@ def vcr_aggregate_results(results):
     Returns:
         A dictionary of dictionary of float, where the outer dictionary has keys "res_stacked_image" and "res_only_it_image"
     """
-
     output = {
-        "res_stacked_image": {
-            "max_sim_val": 0,
-            "precision": 0,
-            "recall": 0,
-            "f1": 0,
-            "jaccard": 0,
-            "rouge1": 0,
-        },
-        "res_only_it_image": {
-            "max_sim_val": 0,
-            "precision": 0,
-            "recall": 0,
-            "f1": 0,
-            "jaccard": 0,
-            "rouge1": 0,
-        },
+        "res_stacked_image__precision": 0,
+        "res_stacked_image__recall": 0,
+        "res_stacked_image__f1": 0,
+        "res_stacked_image__jaccard": 0,
+        "res_stacked_image__rouge1": 0,
+        "res_stacked_image__exact_match": 0,
+        "res_only_it_image__precision": 0,
+        "res_only_it_image__recall": 0,
+        "res_only_it_image__f1": 0,
+        "res_only_it_image__jaccard": 0,
+        "res_only_it_image__rouge1": 0,
+        "res_only_it_image__exact_match": 0,
     }
-    for target_domain in output.keys():
-        for target_metric_name in output[target_domain].keys():
-            score = 0
-            count = 0
-            for inner_dict in results:
-                for inner_key, inner_value in inner_dict.items():
-                    if inner_key == target_domain:
-                        for blank_id, blank_metrics in inner_value.items():
-                            for metric_name, metric_value in blank_metrics.items():
-                                if metric_name == target_metric_name:
-                                    score += metric_value
-                                    count += 1
-            output[target_domain][target_metric_name] = score / count
+
+    for output_key in output.keys():
+        count = 0
+        query_domain, query_metric_name = output_key.split("__")
+        for inner_dict in results:
+            for inner_key, inner_value in inner_dict.items():
+                key_domain, key_metric_name, _ = inner_key.split("__")
+                if query_domain == key_domain and query_metric_name == key_metric_name:
+                    output[output_key] += inner_value
+                    count += 1
+        output[output_key] /= count
     return output
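Aggregation is rewritten to match the flattened keys: each `output` key is split on `"__"` into an image setting and a metric name, matching per-document entries are summed across all documents and blank indices, and the sum is divided by the count. A toy illustration under that assumption (scores invented, only one metric shown):

```python
# Toy data illustrating the averaging over flattened keys; the numbers are invented.
per_doc_results = [
    {"res_stacked_image__f1___0": 0.8, "res_stacked_image__f1___1": 0.6},  # doc with two blanks
    {"res_stacked_image__f1___0": 1.0},                                    # doc with one blank
]
# For the "res_stacked_image__f1" entry, vcr_aggregate_results averages all three values:
# (0.8 + 0.6 + 1.0) / 3 == 0.8
# (a real run also expects entries for the other metrics, since every output key is averaged).
```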