include std and confidence interval · dadwadw233/lmms-eval@93e02a0

```diff
@@ -9,7 +9,7 @@
 import spacy
 from nltk.util import ngrams
 from spacy.cli import download
-
+import numpy as np
 from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

 # Download the English and Chinese models
```

```diff
@@ -231,6 +231,7 @@ def vcr_zh_process_results(doc, results):
         for k in output.keys():
             output[k].append(
                 {
+                    "question_id": doc["question_id"],
                     "score": tmp[k],
                     "pred_ngram": tmp["max_sim_string"],
                     "gt_ngram": crossed_text[i],
```
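This hunk threads the originating `question_id` into every per-blank record that `vcr_zh_process_results` emits, so rows in the detailed results can be traced back to their source question. A rough sketch of the resulting record shape (the values are invented for illustration; only the keys come from the diff):

```python
# Hypothetical example of one per-blank record after this change;
# the values are illustrative, only the keys mirror the diff.
record = {
    "question_id": 17,         # newly added: ties the record to its source question
    "score": 0.8,              # per-blank match score (Jaccard index or exact match)
    "pred_ngram": "快速发展",    # best-matching n-gram found in the model prediction
    "gt_ngram": "经济快速发展",  # ground-truth crossed-out text for this blank
}
```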

```diff
@@ -240,26 +241,53 @@ def vcr_zh_process_results(doc, results):
     return output


+def bootstrap_std(data, n_bootstrap=1000, ci=0.95):
+    """
+    Args:
+        data: a list of values
+        n_bootstrap: number of bootstrap samples
+        ci: confidence level (e.g., 0.95 for a 95% interval)
+    Returns:
+        a tuple of (std of the bootstrap means, lower bound, upper bound)
+    """
+    n = len(data)
+    means = []
+    for _ in range(n_bootstrap):
+        sample = np.random.choice(data, n, replace=True)
+        means.append(np.mean(sample))
+    means = np.array(means)
+    lower_bound = np.percentile(means, (1 - ci) / 2 * 100)
+    upper_bound = np.percentile(means, (1 + ci) / 2 * 100)
+    std = np.std(means)
+    return std, lower_bound, upper_bound
+
+
 def vcr_aggregate_results(results, args):
     """
     Args:
         results: List[List[Dict]], list of results returned by process_results
     Returns:
         A float value representing the final score of jaccard index or exact match
     """
-    scores = 0
-    count = 0
-    output_dict = {}
+    scores = []
+    output_dict_detail_result = {}
     for i in range(len(results)):
         for blank_id in range(len(results[i])):
-            scores += results[i][blank_id]["score"]
-            count += 1
-        output_dict[str(i)] = results[i]
-
+            scores.append(results[i][blank_id]["score"])
+        output_dict_detail_result[str(i)] = results[i]
+    mean_score = np.mean(scores)
+    std, lb, ub = bootstrap_std(scores, n_bootstrap=1000, ci=0.95)
+    output_dict = {
+        "mean_score": mean_score,
+        "std_score": std,
+        "percentile_2.5": lb,
+        "percentile_97.5": ub,
+        "detailed_results": output_dict_detail_result,
+    }
     now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
     path = generate_submission_file(f"vcr_submission_{now_date_time}.json", args)
-    with open(path, "w", encoding='utf-8') as f:
+    with open(path, "w", encoding="utf-8") as f:
         json.dump(output_dict, f, indent=4, ensure_ascii=False)
     # print(f"Submission file saved to {path}")
     eval_logger.info(f"Submission file saved to {path}")
-    return scores / count
+    return mean_score
```
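`bootstrap_std` estimates the uncertainty of the mean score by resampling: it draws `n_bootstrap` samples of the score list with replacement, computes each resample's mean, and reports the standard deviation of those means together with a percentile confidence interval. With `ci=0.95`, the expressions `(1 - ci) / 2 * 100` and `(1 + ci) / 2 * 100` evaluate to 2.5 and 97.5, which is where the `percentile_2.5` / `percentile_97.5` keys in the submission file come from. Below is a minimal, self-contained sketch of the same routine on synthetic scores; the beta-distributed sample data is an assumption for demonstration only, not part of the commit.

```python
import numpy as np


def bootstrap_std(data, n_bootstrap=1000, ci=0.95):
    """Bootstrap standard error and percentile CI of the mean, as in the diff."""
    n = len(data)
    # Resample with replacement and record each resample's mean.
    means = np.array([np.mean(np.random.choice(data, n, replace=True)) for _ in range(n_bootstrap)])
    # For ci=0.95 these are the 2.5th and 97.5th percentiles of the bootstrap means.
    lower_bound = np.percentile(means, (1 - ci) / 2 * 100)
    upper_bound = np.percentile(means, (1 + ci) / 2 * 100)
    return np.std(means), lower_bound, upper_bound


# Synthetic per-blank scores in [0, 1]; in the task these come from process_results.
scores = np.random.beta(2, 5, size=500)
std, lb, ub = bootstrap_std(scores, n_bootstrap=1000, ci=0.95)
print(f"mean={np.mean(scores):.4f} std={std:.4f} 95% CI=[{lb:.4f}, {ub:.4f}]")
```

One design note: the returned `std` is the standard deviation of the bootstrap means (a bootstrap standard error of the mean), not the standard deviation of the raw scores, so it shrinks as the number of scored blanks grows.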