switch logic · EvolvingLMMs-Lab/lmms-eval@043b483

@@ -6,6 +6,9 @@
 from spacy.cli import download
 from nltk.util import ngrams
 from functools import partial
+import datetime
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
+import json
 
 # Download the English and Chinese models
 download("en_core_web_sm")

@@ -46,7 +49,7 @@ def fast_filter(answer_text):
 
 
 def vcr_doc_to_visual(doc):
-    return [doc["stacked_image"].convert("RGB"), doc["only_it_image"].convert("RGB")]
+    return [doc["stacked_image"].convert("RGB")]
 
 
 def vcr_doc_to_text(doc, model_specific_prompt_kwargs=None):

@@ -80,7 +83,7 @@ def vcr_process_results_single(crossed_text, result, language):
         doc: a instance of the eval dataset
         results: [pred]
     Returns:
-        a dictionary with key: metric name (in this case mme score), value: metric value
+        a dictionary with key: metric name (in this case vcr score), value: metric value
     """
 
     assert language in ["en", "zh"], f"Language {language} is not supported."

@@ -171,29 +174,28 @@ def vcr_en_process_results(doc, results):
         doc: a instance of the eval dataset
         results: [pred]
     Returns:
-        a dictionary with key: metric name (in this case mme score), value: metric value
+        a dictionary with key: metric name (in this case vcr score), value: metric value
     """
-    assert len(results) == 2, f"Expected 2 results, got {len(results)}"
-    output = {}
-    for i in range(len(doc["crossed_text"])):
-        res_stacked_image_results = vcr_process_results_single(
-            doc["crossed_text"][i], results[0], "en"
-        )
-        res_only_image_results = vcr_process_results_single(
-            doc["crossed_text"][i], results[1], "en"
-        )
-        output.update(
-            {
-                f"res_stacked_image__{k}___{i}": v
-                for k, v in res_stacked_image_results.items()
-            }
-        )
-        output.update(
-            {
-                f"res_only_it_image__{k}___{i}": v
-                for k, v in res_only_image_results.items()
-            }
-        )
+    output = {
+        "max_sim_val": [],
+        "precision": [],
+        "recall": [],
+        "f1": [],
+        "jaccard": [],
+        "rouge1": [],
+        "exact_match": [],
+    }
+    crossed_text = doc["crossed_text"]
+    for i in range(len(crossed_text)):
+        tmp = vcr_process_results_single(crossed_text[i], results, "en")
+        for k in output.keys():
+            output[k].append(
+                {
+                    "score": tmp[k],
+                    "max_sim_string": tmp["max_sim_string"],
+                    "caption": doc["caption"],
+                }
+            )
     return output
 
 

@@ -203,62 +205,51 @@ def vcr_zh_process_results(doc, results):
         doc: a instance of the eval dataset
         results: [pred]
     Returns:
-        a dictionary with key: metric name (in this case mme score), value: metric value
+        a dictionary with key: metric name (in this case vcr score), value: metric value
     """
-    assert len(results) == 2, f"Expected 2 results, got {len(results)}"
-    output = {}
-    for i in range(len(doc["crossed_text"])):
-        res_stacked_image_results = vcr_process_results_single(
-            doc["crossed_text"][i], results[0], "zh"
-        )
-        res_only_image_results = vcr_process_results_single(
-            doc["crossed_text"][i], results[1], "zh"
-        )
-        output.update(
-            {
-                f"res_stacked_image__{k}___{i}": v
-                for k, v in res_stacked_image_results.items()
-            }
-        )
-        output.update(
-            {
-                f"res_only_it_image__{k}___{i}": v
-                for k, v in res_only_image_results.items()
-            }
-        )
+    output = {
+        "max_sim_val": [],
+        "precision": [],
+        "recall": [],
+        "f1": [],
+        "jaccard": [],
+        "rouge1": [],
+        "exact_match": [],
+    }
+    crossed_text = doc["crossed_text"]
+    for i in range(len(crossed_text)):
+        tmp = vcr_process_results_single(crossed_text[i], results, "zh")
+        for k in output.keys():
+            output[k].append(
+                {
+                    "score": tmp[k],
+                    "max_sim_string": tmp["max_sim_string"],
+                    "caption": doc["caption"],
+                }
+            )
     return output
 
 
-def vcr_aggregate_results(results):
+def vcr_aggregate_results(results, args):
     """
     Args:
         results: a list of values returned by process_results
     Returns:
        A dictionary of dictionary of float, where the outer dictionary has keys "res_stacked_image" and "res_only_it_image"
    """
-    output = {
-        "res_stacked_image__precision": 0,
-        "res_stacked_image__recall": 0,
-        "res_stacked_image__f1": 0,
-        "res_stacked_image__jaccard": 0,
-        "res_stacked_image__rouge1": 0,
-        "res_stacked_image__exact_match": 0,
-        "res_only_it_image__precision": 0,
-        "res_only_it_image__recall": 0,
-        "res_only_it_image__f1": 0,
-        "res_only_it_image__jaccard": 0,
-        "res_only_it_image__rouge1": 0,
-        "res_only_it_image__exact_match": 0,
-    }
-
-    for output_key in output.keys():
-        count = 0
-        query_domain, query_metric_name = output_key.split("__")
-        for inner_dict in results:
-            for inner_key, inner_value in inner_dict.items():
-                key_domain, key_metric_name, _ = inner_key.split("__")
-                if query_domain == key_domain and query_metric_name == key_metric_name:
-                    output[output_key] += inner_value
-                    count += 1
-        output[output_key] /= count
-    return output
+    scores = 0
+    count = 0
+    output_dict = {}
+    for i in range(len(results)):
+        for blank_id in range(len(results[i])):
+            scores += results[i][blank_id]["score"]
+            count += 1
+        output_dict[str(i)] = results[i]
+
+    now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    path = generate_submission_file(f"vcr_submission_{now_date_time}.json", args)
+    with open(path, "w") as f:
+        json.dump(output_dict, f)
+    print(f"Submission file saved to {path}")
+    eval_logger.info(f"Submission file saved to {path}")
+    return scores / count
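
For context, a minimal sketch (not part of the commit) of how the restructured per-blank results feed the new averaging in vcr_aggregate_results. The sample scores, strings, and caption below are invented for illustration; only the dict shape mirrors the code above.

# Each process_results call now returns, per metric, a list with one
# entry per crossed-out blank in the document.
example_per_doc = [
    {"score": 1.0, "max_sim_string": "quick", "caption": "a sample caption"},
    {"score": 0.0, "max_sim_string": "fox", "caption": "a sample caption"},
]

# vcr_aggregate_results receives one such list per document for a given
# metric and returns the mean score over all blanks in all documents.
results = [example_per_doc]
scores = sum(entry["score"] for per_doc in results for entry in per_doc)
count = sum(len(per_doc) for per_doc in results)
assert scores / count == 0.5  # (1.0 + 0.0) / 2 blanks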