include std and confidence interval · dadwadw233/lmms-eval@93e02a0

```diff
@@ -9,7 +9,7 @@
 import spacy
 from nltk.util import ngrams
 from spacy.cli import download
-
+import numpy as np
 from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

 # Download the English and Chinese models
```

```diff
@@ -231,6 +231,7 @@ def vcr_zh_process_results(doc, results):
         for k in output.keys():
             output[k].append(
                 {
+                    "question_id": doc["question_id"],
                     "score": tmp[k],
                     "pred_ngram": tmp["max_sim_string"],
                     "gt_ngram": crossed_text[i],
```
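This hunk threads the originating `question_id` into every per-blank record that `vcr_zh_process_results` emits, so rows in the detailed results can be traced back to their source question. A rough sketch of the resulting record shape (the values are invented for illustration; only the keys come from the diff):

```python
# Hypothetical example of one per-blank record after this change;
# the values are illustrative, only the keys mirror the diff.
record = {
    "question_id": 17,         # newly added: ties the record to its source question
    "score": 0.8,              # per-blank match score (Jaccard index or exact match)
    "pred_ngram": "快速发展",    # best-matching n-gram found in the model prediction
    "gt_ngram": "经济快速发展",  # ground-truth crossed-out text for this blank
}
```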

```diff
@@ -240,26 +241,53 @@ def vcr_zh_process_results(doc, results):
     return output


+def bootstrap_std(data, n_bootstrap=1000, ci=0.95):
+    """
+    Args:
+        data: a list of values
+        n_bootstrap: number of bootstrap samples
+        ci: confidence level (e.g., 0.95 for a 95% interval)
+    Returns:
+        a tuple of (std of the bootstrap means, lower bound, upper bound)
+    """
+    n = len(data)
+    means = []
+    for _ in range(n_bootstrap):
+        sample = np.random.choice(data, n, replace=True)
+        means.append(np.mean(sample))
+    means = np.array(means)
+    lower_bound = np.percentile(means, (1 - ci) / 2 * 100)
+    upper_bound = np.percentile(means, (1 + ci) / 2 * 100)
+    std = np.std(means)
+    return std, lower_bound, upper_bound
+
+
 def vcr_aggregate_results(results, args):
     """
     Args:
         results: List[List[Dict]], list of results returned by process_results
     Returns:
         A float value representing the final score of jaccard index or exact match
     """
-    scores = 0
-    count = 0
-    output_dict = {}
+    scores = []
+    output_dict_detail_result = {}
     for i in range(len(results)):
         for blank_id in range(len(results[i])):
-            scores += results[i][blank_id]["score"]
-            count += 1
-        output_dict[str(i)] = results[i]
-
+            scores.append(results[i][blank_id]["score"])
+        output_dict_detail_result[str(i)] = results[i]
+    mean_score = np.mean(scores)
+    std, lb, ub = bootstrap_std(scores, n_bootstrap=1000, ci=0.95)
+    output_dict = {
+        "mean_score": mean_score,
+        "std_score": std,
+        "percentile_2.5": lb,
+        "percentile_97.5": ub,
+        "detailed_results": output_dict_detail_result,
+    }
     now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
     path = generate_submission_file(f"vcr_submission_{now_date_time}.json", args)
-    with open(path, "w", encoding='utf-8') as f:
+    with open(path, "w", encoding="utf-8") as f:
         json.dump(output_dict, f, indent=4, ensure_ascii=False)
     # print(f"Submission file saved to {path}")
     eval_logger.info(f"Submission file saved to {path}")
-    return scores / count
+    return mean_score
```
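`bootstrap_std` estimates the uncertainty of the mean score by resampling: it draws `n_bootstrap` samples of the score list with replacement, computes each resample's mean, and reports the standard deviation of those means together with a percentile confidence interval. With `ci=0.95`, the expressions `(1 - ci) / 2 * 100` and `(1 + ci) / 2 * 100` evaluate to 2.5 and 97.5, which is where the `percentile_2.5` / `percentile_97.5` keys in the submission file come from. Below is a minimal, self-contained sketch of the same routine on synthetic scores; the beta-distributed sample data is an assumption for demonstration only, not part of the commit.

```python
import numpy as np


def bootstrap_std(data, n_bootstrap=1000, ci=0.95):
    """Bootstrap standard error and percentile CI of the mean, as in the diff."""
    n = len(data)
    # Resample with replacement and record each resample's mean.
    means = np.array([np.mean(np.random.choice(data, n, replace=True)) for _ in range(n_bootstrap)])
    # For ci=0.95 these are the 2.5th and 97.5th percentiles of the bootstrap means.
    lower_bound = np.percentile(means, (1 - ci) / 2 * 100)
    upper_bound = np.percentile(means, (1 + ci) / 2 * 100)
    return np.std(means), lower_bound, upper_bound


# Synthetic per-blank scores in [0, 1]; in the task these come from process_results.
scores = np.random.beta(2, 5, size=500)
std, lb, ub = bootstrap_std(scores, n_bootstrap=1000, ci=0.95)
print(f"mean={np.mean(scores):.4f} std={std:.4f} 95% CI=[{lb:.4f}, {ub:.4f}]")
```

One design note: the returned `std` is the standard deviation of the bootstrap means (a bootstrap standard error of the mean), not the standard deviation of the raw scores, so it shrinks as the number of scored blanks grows.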