include std and confidence interval · EvolvingLMMs-Lab/lmms-eval@326b969
@@ -9,7 +9,7 @@
 import spacy
 from nltk.util import ngrams
 from spacy.cli import download
-
+import numpy as np
 from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

 # Download the English and Chinese models
@@ -231,6 +231,7 @@ def vcr_zh_process_results(doc, results):
         for k in output.keys():
             output[k].append(
                 {
+                    "question_id": doc["question_id"],
                     "score": tmp[k],
                     "pred_ngram": tmp["max_sim_string"],
                     "gt_ngram": crossed_text[i],
@@ -240,26 +241,53 @@ def vcr_zh_process_results(doc, results):
     return output


+def bootstrap_std(data, n_bootstrap=1000, ci=0.95):
+    """
+    Args:
+        data: a list of values
+        n_bootstrap: number of bootstrap samples
+        ci: confidence interval
+    Returns:
+        a tuple of mean, lower bound, upper bound
+    """
+    n = len(data)
+    means = []
+    for _ in range(n_bootstrap):
+        sample = np.random.choice(data, n, replace=True)
+        means.append(np.mean(sample))
+    means = np.array(means)
+    lower_bound = np.percentile(means, (1 - ci) / 2 * 100)
+    upper_bound = np.percentile(means, (1 + ci) / 2 * 100)
+    std = np.std(means)
+    return std, lower_bound, upper_bound
+
+
 def vcr_aggregate_results(results, args):
     """
     Args:
         results: List[List[Dict]], list of results returned by process_results
     Returns:
         A float value representing the final score of jaccard index or exact match
     """
-    scores = 0
-    count = 0
-    output_dict = {}
+    scores = []
+    output_dict_detail_result = {}
     for i in range(len(results)):
         for blank_id in range(len(results[i])):
-            scores += results[i][blank_id]["score"]
-            count += 1
-        output_dict[str(i)] = results[i]
-
+            scores.append(results[i][blank_id]["score"])
+        output_dict_detail_result[str(i)] = results[i]
+    mean_score = np.mean(scores)
+    std, lb, ub = bootstrap_std(scores, n_bootstrap=1000, ci=0.95)
+    output_dict = {
+        "mean_score": mean_score,
+        "std_score": std,
+        "percentile_2.5": lb,
+        "percentie_97.5": ub,
+        "detailed_results": output_dict_detail_result,
+    }
     now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
     path = generate_submission_file(f"vcr_submission_{now_date_time}.json", args)
-    with open(path, "w", encoding='utf-8') as f:
+    with open(path, "w", encoding="utf-8") as f:
         json.dump(output_dict, f, indent=4, ensure_ascii=False)
     # print(f"Submission file saved to {path}")
     eval_logger.info(f"Submission file saved to {path}")
-    return scores / count
+    return mean_score
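For reference, a minimal standalone sketch of what the new helper computes; the score values below are made up for illustration, and the function body mirrors the bootstrap_std added in this commit. With ci=0.95, the returned bounds are the 2.5th and 97.5th percentiles of the resampled means (the percentile bounds that vcr_aggregate_results now writes to the submission JSON), and std is the standard deviation of the resampled means, i.e. a bootstrap standard error of the mean score.

import numpy as np

def bootstrap_std(data, n_bootstrap=1000, ci=0.95):
    # Same logic as the helper introduced in this commit.
    n = len(data)
    means = []
    for _ in range(n_bootstrap):
        sample = np.random.choice(data, n, replace=True)
        means.append(np.mean(sample))
    means = np.array(means)
    lower_bound = np.percentile(means, (1 - ci) / 2 * 100)
    upper_bound = np.percentile(means, (1 + ci) / 2 * 100)
    std = np.std(means)
    return std, lower_bound, upper_bound

# Hypothetical per-blank scores, for illustration only.
scores = [0.0, 0.25, 0.5, 0.5, 0.75, 1.0, 1.0, 1.0]
std, lb, ub = bootstrap_std(scores, n_bootstrap=1000, ci=0.95)
print(f"mean={np.mean(scores):.3f} std={std:.3f} 95% CI=({lb:.3f}, {ub:.3f})")

With this change, vcr_aggregate_results collects per-blank scores into a list, reports mean_score, std_score, and the two percentile bounds alongside the per-document detailed results in the submission file, and returns mean_score rather than the previous running-sum average.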