Merge branch 'main' of https://github.com/EvolvingLMMs-Lab/lmms-eval · dadwadw233/lmms-eval@03b2f7c
```diff
@@ -106,9 +106,16 @@ def parse_eval_args() -> argparse.Namespace:
     parser.add_argument(
         "--log_samples_suffix",
         type=str,
-        default="",
+        default="model_outputs",
         help="Specify a suffix for the log_samples file name.",
     )
+    parser.add_argument(
+        "--predict_only",
+        "-x",
+        action="store_true",
+        default=False,
+        help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
+    )
     parser.add_argument(
         "--show_config",
         action="store_true",
```
```diff
@@ -228,6 +235,10 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
 
     initialize_tasks(args.verbosity)
 
+    if args.predict_only:
+        args.log_samples = True
+    if (args.log_samples or args.predict_only) and not args.output_path:
+        raise ValueError("Specify --output_path if providing --log_samples or --predict_only")
     if args.limit:
         eval_logger.warning(" --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
     if args.include_path is not None:
```
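The new `--predict_only` flag forces sample logging on and, like `--log_samples`, refuses to run without an `--output_path` to write to. A minimal standalone sketch of that wiring, using a hypothetical trimmed-down `build_parser()` rather than the repository's actual parser (defaults not shown in this diff are assumptions):

```python
import argparse


def build_parser() -> argparse.ArgumentParser:
    # Hypothetical, trimmed-down subset of the CLI arguments touched by this diff.
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_path", type=str, default=None)
    parser.add_argument("--log_samples", action="store_true", default=False)
    parser.add_argument("--log_samples_suffix", type=str, default="model_outputs")
    parser.add_argument("--predict_only", "-x", action="store_true", default=False)
    return parser


args = build_parser().parse_args(["-x", "--output_path", "./logs"])

# --predict_only implies --log_samples: outputs are saved, metric evaluation is skipped.
if args.predict_only:
    args.log_samples = True
# Both modes write files, so an output path is mandatory.
if (args.log_samples or args.predict_only) and not args.output_path:
    raise ValueError("Specify --output_path if providing --log_samples or --predict_only")

print(args.log_samples)  # True
```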
```diff
@@ -244,14 +255,17 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
             "\n" + "=" * 70 + "\n" + "\n\tYou are trying to check all the numbers in each task." + "\n\tThis action will download the complete dataset." + "\n\tIf the results are not clear initially, call this again." + "\n\n" + "=" * 70
         )
         eval_logger.info(log_message)
-        task_dict = get_task_dict([task for task in sorted(ALL_TASKS)], model_name="llava")
-        for task_name in task_dict.keys():
-            task_obj = task_dict[task_name]
-            if type(task_obj) == tuple:
-                group, task_obj = task_obj
-            if task_obj is None:
-                continue
-            eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}")
+        for task_name in sorted(ALL_TASKS):
+            try:
+                task_dict = get_task_dict([task_name], model_name="llava")
+                task_obj = task_dict[task_name]
+                if type(task_obj) == tuple:
+                    group, task_obj = task_obj
+                if task_obj is None:
+                    continue
+                eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}")
+            except Exception as e:
+                eval_logger.debug(f"\nTask : {task_name} fail to load \n Exception : \n {e}")
         sys.exit()
     else:
         tasks_list = args.tasks.split(",")
```
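The task listing now loads each task individually inside a `try`/`except`, so one broken task definition is logged at debug level instead of aborting the whole listing. A small sketch of the same isolation pattern with stand-in data (the `load_task` helper, the registry contents, and the doc counts below are hypothetical, not the repository's API):

```python
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("task_listing")

# Stand-in registry: "bad_task" simulates a task whose config fails to load.
ALL_TASKS = {"taskA": 120, "taskB": 450, "bad_task": None}


def load_task(name: str) -> int:
    """Hypothetical loader returning the number of docs for a task."""
    count = ALL_TASKS[name]
    if count is None:
        raise RuntimeError(f"config for {name} is malformed")
    return count


for task_name in sorted(ALL_TASKS):
    try:
        num_docs = load_task(task_name)
        logger.info(f"Task : {task_name}\n - #num : {num_docs}")
    except Exception as e:
        # One failing task no longer stops the listing of the others.
        logger.debug(f"Task : {task_name} failed to load\nException :\n{e}")
```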
```diff
@@ -271,6 +285,10 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
     # set datetime before evaluation
     datetime_str = utils.get_datetime_str(timezone=args.timezone)
     if args.output_path:
+        if args.log_samples_suffix and len(args.log_samples_suffix) > 15:
+            eval_logger.warning("The suffix for log_samples is too long. It is recommended to keep it under 15 characters.")
+            args.log_samples_suffix = args.log_samples_suffix[:5] + "..." + args.log_samples_suffix[-5:]
+
         hash_input = f"{args.model_args}".encode("utf-8")
         hash_output = hashlib.sha256(hash_input).hexdigest()[:6]
         path = Path(args.output_path)
```
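The suffix guard warns when `--log_samples_suffix` exceeds 15 characters and then truncates it to the first five and last five characters joined by an ellipsis (13 characters total). A quick illustration of that exact slicing:

```python
suffix = "my_very_long_experiment_name"  # 29 characters, over the 15-character limit

if len(suffix) > 15:
    # Keep the head and tail so the file name stays short but still recognizable.
    suffix = suffix[:5] + "..." + suffix[-5:]

print(suffix)       # my_ve..._name
print(len(suffix))  # 13
```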
```diff
@@ -293,6 +311,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
         log_samples=args.log_samples,
         gen_kwargs=args.gen_kwargs,
         cli_args=args,
+        predict_only=args.predict_only,
     )
 
     if results is not None:
```
```diff
@@ -315,9 +334,9 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
             for task_name, config in results["configs"].items():
                 filename = args.output_path.joinpath(f"{task_name}.json")
                 # Structure the data with 'args' and 'logs' keys
-                data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"])}  # Convert Namespace to dict
-                samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable)
-                filename.open("w").write(samples_dumped)
+                data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"]), "time": datetime_str}
+                samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable, ensure_ascii=False)
+                filename.open("w", encoding="utf-8").write(samples_dumped)
                 eval_logger.info(f"Saved samples to {filename}")
 
         return results, samples
```
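Besides stamping a `"time"` field into the dump, the changed lines matter for multilingual benchmarks: `ensure_ascii=False` keeps non-ASCII model outputs readable instead of `\uXXXX` escapes, and the explicit `encoding="utf-8"` makes the file write independent of the platform's default encoding. A small self-contained sketch of the difference (the sample record here is made up):

```python
import json
from pathlib import Path

# Made-up record standing in for one logged model output.
record = {"doc_id": 0, "response": "图中有两只猫"}

escaped = json.dumps(record, indent=4)                       # non-ASCII becomes \uXXXX escapes
readable = json.dumps(record, indent=4, ensure_ascii=False)  # characters are kept as-is

print("\\u56fe" in escaped)  # True: the first character was escaped
print("图" in readable)      # True: the character survives verbatim

# Writing with an explicit encoding avoids depending on the platform default.
with Path("example.json").open("w", encoding="utf-8") as f:
    f.write(readable)
```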