switch logic · EvolvingLMMs-Lab/lmms-eval@043b483

```diff
@@ -6,6 +6,9 @@
 from spacy.cli import download
 from nltk.util import ngrams
 from functools import partial
+import datetime
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
+import json

 # Download the English and Chinese models
 download("en_core_web_sm")
```
```diff
@@ -46,7 +49,7 @@ def fast_filter(answer_text):


 def vcr_doc_to_visual(doc):
-    return [doc["stacked_image"].convert("RGB"), doc["only_it_image"].convert("RGB")]
+    return [doc["stacked_image"].convert("RGB")]


 def vcr_doc_to_text(doc, model_specific_prompt_kwargs=None):
```
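The commit's "switch" starts here: each document now supplies only the stacked image, so the model produces a single prediction per document instead of one per image variant. A minimal sketch of the new contract, using a hypothetical in-memory `doc` rather than a real dataset record:

```python
from PIL import Image


def vcr_doc_to_visual(doc):
    # Same body as the new version in the diff above: only the stacked image is returned.
    return [doc["stacked_image"].convert("RGB")]


# Hypothetical stand-in for a dataset record; real docs also carry fields such as
# "caption" and "crossed_text" that are used further down in this file.
doc = {"stacked_image": Image.new("RGB", (448, 224))}

visuals = vcr_doc_to_visual(doc)
assert len(visuals) == 1  # previously 2: stacked_image and only_it_image
```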
```diff
@@ -80,7 +83,7 @@ def vcr_process_results_single(crossed_text, result, language):
         doc: a instance of the eval dataset
         results: [pred]
     Returns:
-        a dictionary with key: metric name (in this case mme score), value: metric value
+        a dictionary with key: metric name (in this case vcr score), value: metric value
     """

     assert language in ["en", "zh"], f"Language {language} is not supported."
```
```diff
@@ -171,29 +174,28 @@ def vcr_en_process_results(doc, results):
         doc: a instance of the eval dataset
         results: [pred]
     Returns:
-        a dictionary with key: metric name (in this case mme score), value: metric value
+        a dictionary with key: metric name (in this case vcr score), value: metric value
     """
-    assert len(results) == 2, f"Expected 2 results, got {len(results)}"
-    output = {}
-    for i in range(len(doc["crossed_text"])):
-        res_stacked_image_results = vcr_process_results_single(
-            doc["crossed_text"][i], results[0], "en"
-        )
-        res_only_image_results = vcr_process_results_single(
-            doc["crossed_text"][i], results[1], "en"
-        )
-        output.update(
-            {
-                f"res_stacked_image__{k}___{i}": v
-                for k, v in res_stacked_image_results.items()
-            }
-        )
-        output.update(
-            {
-                f"res_only_it_image__{k}___{i}": v
-                for k, v in res_only_image_results.items()
-            }
-        )
+    output = {
+        "max_sim_val": [],
+        "precision": [],
+        "recall": [],
+        "f1": [],
+        "jaccard": [],
+        "rouge1": [],
+        "exact_match": [],
+    }
+    crossed_text = doc["crossed_text"]
+    for i in range(len(crossed_text)):
+        tmp = vcr_process_results_single(crossed_text[i], results, "en")
+        for k in output.keys():
+            output[k].append(
+                {
+                    "score": tmp[k],
+                    "max_sim_string": tmp["max_sim_string"],
+                    "caption": doc["caption"],
+                }
+            )
     return output


```
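After this change, `vcr_en_process_results` returns per-metric lists rather than flat `res_stacked_image__*` / `res_only_it_image__*` keys: one entry per crossed-out blank, each carrying the score, the best-matching string, and the caption. An illustrative shape with made-up values:

```python
# Hypothetical values, shown only to illustrate the new output shape.
example_output = {
    "exact_match": [
        {"score": 1.0, "max_sim_string": "colosseum", "caption": "..."},
        {"score": 0.0, "max_sim_string": "ancient rome", "caption": "..."},
    ],
    # "max_sim_val", "precision", "recall", "f1", "jaccard" and "rouge1"
    # hold lists of the same shape, one entry per crossed-out blank.
}
```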
```diff
@@ -203,62 +205,51 @@ def vcr_zh_process_results(doc, results):
         doc: a instance of the eval dataset
         results: [pred]
     Returns:
-        a dictionary with key: metric name (in this case mme score), value: metric value
+        a dictionary with key: metric name (in this case vcr score), value: metric value
     """
-    assert len(results) == 2, f"Expected 2 results, got {len(results)}"
-    output = {}
-    for i in range(len(doc["crossed_text"])):
-        res_stacked_image_results = vcr_process_results_single(
-            doc["crossed_text"][i], results[0], "zh"
-        )
-        res_only_image_results = vcr_process_results_single(
-            doc["crossed_text"][i], results[1], "zh"
-        )
-        output.update(
-            {
-                f"res_stacked_image__{k}___{i}": v
-                for k, v in res_stacked_image_results.items()
-            }
-        )
-        output.update(
-            {
-                f"res_only_it_image__{k}___{i}": v
-                for k, v in res_only_image_results.items()
-            }
-        )
+    output = {
+        "max_sim_val": [],
+        "precision": [],
+        "recall": [],
+        "f1": [],
+        "jaccard": [],
+        "rouge1": [],
+        "exact_match": [],
+    }
+    crossed_text = doc["crossed_text"]
+    for i in range(len(crossed_text)):
+        tmp = vcr_process_results_single(crossed_text[i], results, "zh")
+        for k in output.keys():
+            output[k].append(
+                {
+                    "score": tmp[k],
+                    "max_sim_string": tmp["max_sim_string"],
+                    "caption": doc["caption"],
+                }
+            )
     return output


-def vcr_aggregate_results(results):
+def vcr_aggregate_results(results, args):
     """
     Args:
         results: a list of values returned by process_results
     Returns:
         A dictionary of dictionary of float, where the outer dictionary has keys "res_stacked_image" and "res_only_it_image"
     """
-    output = {
-        "res_stacked_image__precision": 0,
-        "res_stacked_image__recall": 0,
-        "res_stacked_image__f1": 0,
-        "res_stacked_image__jaccard": 0,
-        "res_stacked_image__rouge1": 0,
-        "res_stacked_image__exact_match": 0,
-        "res_only_it_image__precision": 0,
-        "res_only_it_image__recall": 0,
-        "res_only_it_image__f1": 0,
-        "res_only_it_image__jaccard": 0,
-        "res_only_it_image__rouge1": 0,
-        "res_only_it_image__exact_match": 0,
-    }
-
-    for output_key in output.keys():
-        count = 0
-        query_domain, query_metric_name = output_key.split("__")
-        for inner_dict in results:
-            for inner_key, inner_value in inner_dict.items():
-                key_domain, key_metric_name, _ = inner_key.split("__")
-                if query_domain == key_domain and query_metric_name == key_metric_name:
-                    output[output_key] += inner_value
-                    count += 1
-        output[output_key] /= count
-    return output
+    scores = 0
+    count = 0
+    output_dict = {}
+    for i in range(len(results)):
+        for blank_id in range(len(results[i])):
+            scores += results[i][blank_id]["score"]
+            count += 1
+        output_dict[str(i)] = results[i]
+
+    now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    path = generate_submission_file(f"vcr_submission_{now_date_time}.json", args)
+    with open(path, "w") as f:
+        json.dump(output_dict, f)
+    print(f"Submission file saved to {path}")
+    eval_logger.info(f"Submission file saved to {path}")
+    return scores / count
```
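The aggregator no longer averages per-domain metrics into a dictionary: it flattens the per-blank entries, averages their `score` field into a single float, and dumps all entries to a timestamped submission JSON via `generate_submission_file` before returning. (The unchanged docstring above still describes the old dictionary-of-dictionaries return value.) A minimal sketch of the averaging step with toy inputs, leaving out the file dump:

```python
# Toy, hypothetical per-document entries mirroring the shape produced by
# vcr_en_process_results / vcr_zh_process_results for a single metric.
results = [
    [{"score": 1.0, "max_sim_string": "fox", "caption": "..."}],
    [
        {"score": 0.5, "max_sim_string": "dog", "caption": "..."},
        {"score": 0.0, "max_sim_string": "cat", "caption": "..."},
    ],
]

scores = sum(entry["score"] for doc_entries in results for entry in doc_entries)
count = sum(len(doc_entries) for doc_entries in results)
print(scores / count)  # 0.5, the single float reported as the task metric
```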