Merge branch 'main' of https://github.com/EvolvingLMMs-Lab/lmms-eval · dadwadw233/lmms-eval@03b2f7c
```diff
@@ -106,9 +106,16 @@ def parse_eval_args() -> argparse.Namespace:
     parser.add_argument(
         "--log_samples_suffix",
         type=str,
-        default="",
+        default="model_outputs",
         help="Specify a suffix for the log_samples file name.",
     )
+    parser.add_argument(
+        "--predict_only",
+        "-x",
+        action="store_true",
+        default=False,
+        help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
+    )
     parser.add_argument(
         "--show_config",
         action="store_true",
```
```diff
@@ -228,6 +235,10 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
 
     initialize_tasks(args.verbosity)
 
+    if args.predict_only:
+        args.log_samples = True
+    if (args.log_samples or args.predict_only) and not args.output_path:
+        raise ValueError("Specify --output_path if providing --log_samples or --predict_only")
     if args.limit:
         eval_logger.warning(" --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
     if args.include_path is not None:
```
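The new `--predict_only` flag forces sample logging on and, like `--log_samples`, refuses to run without an `--output_path` to write to. A minimal standalone sketch of that wiring, using a hypothetical trimmed-down `build_parser()` rather than the repository's actual parser (defaults not shown in this diff are assumptions):

```python
import argparse


def build_parser() -> argparse.ArgumentParser:
    # Hypothetical, trimmed-down subset of the CLI arguments touched by this diff.
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_path", type=str, default=None)
    parser.add_argument("--log_samples", action="store_true", default=False)
    parser.add_argument("--log_samples_suffix", type=str, default="model_outputs")
    parser.add_argument("--predict_only", "-x", action="store_true", default=False)
    return parser


args = build_parser().parse_args(["-x", "--output_path", "./logs"])

# --predict_only implies --log_samples: outputs are saved, metric evaluation is skipped.
if args.predict_only:
    args.log_samples = True
# Both modes write files, so an output path is mandatory.
if (args.log_samples or args.predict_only) and not args.output_path:
    raise ValueError("Specify --output_path if providing --log_samples or --predict_only")

print(args.log_samples)  # True
```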
```diff
@@ -244,14 +255,17 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
             "\n" + "=" * 70 + "\n" + "\n\tYou are trying to check all the numbers in each task." + "\n\tThis action will download the complete dataset." + "\n\tIf the results are not clear initially, call this again." + "\n\n" + "=" * 70
         )
         eval_logger.info(log_message)
-        task_dict = get_task_dict([task for task in sorted(ALL_TASKS)], model_name="llava")
-        for task_name in task_dict.keys():
-            task_obj = task_dict[task_name]
-            if type(task_obj) == tuple:
-                group, task_obj = task_obj
-            if task_obj is None:
-                continue
-            eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}")
+        for task_name in sorted(ALL_TASKS):
+            try:
+                task_dict = get_task_dict([task_name], model_name="llava")
+                task_obj = task_dict[task_name]
+                if type(task_obj) == tuple:
+                    group, task_obj = task_obj
+                if task_obj is None:
+                    continue
+                eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}")
+            except Exception as e:
+                eval_logger.debug(f"\nTask : {task_name} fail to load \n Exception : \n {e}")
         sys.exit()
     else:
         tasks_list = args.tasks.split(",")
```
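The task listing now loads each task individually inside a `try`/`except`, so one broken task definition is logged at debug level instead of aborting the whole listing. A small sketch of the same isolation pattern with stand-in data (the `load_task` helper, the registry contents, and the doc counts below are hypothetical, not the repository's API):

```python
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("task_listing")

# Stand-in registry: "bad_task" simulates a task whose config fails to load.
ALL_TASKS = {"taskA": 120, "taskB": 450, "bad_task": None}


def load_task(name: str) -> int:
    """Hypothetical loader returning the number of docs for a task."""
    count = ALL_TASKS[name]
    if count is None:
        raise RuntimeError(f"config for {name} is malformed")
    return count


for task_name in sorted(ALL_TASKS):
    try:
        num_docs = load_task(task_name)
        logger.info(f"Task : {task_name}\n - #num : {num_docs}")
    except Exception as e:
        # One failing task no longer stops the listing of the others.
        logger.debug(f"Task : {task_name} failed to load\nException :\n{e}")
```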
```diff
@@ -271,6 +285,10 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
     # set datetime before evaluation
     datetime_str = utils.get_datetime_str(timezone=args.timezone)
     if args.output_path:
+        if args.log_samples_suffix and len(args.log_samples_suffix) > 15:
+            eval_logger.warning("The suffix for log_samples is too long. It is recommended to keep it under 15 characters.")
+            args.log_samples_suffix = args.log_samples_suffix[:5] + "..." + args.log_samples_suffix[-5:]
+
         hash_input = f"{args.model_args}".encode("utf-8")
         hash_output = hashlib.sha256(hash_input).hexdigest()[:6]
         path = Path(args.output_path)
```
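The suffix guard warns when `--log_samples_suffix` exceeds 15 characters and then truncates it to the first five and last five characters joined by an ellipsis (13 characters total). A quick illustration of that exact slicing:

```python
suffix = "my_very_long_experiment_name"  # 29 characters, over the 15-character limit

if len(suffix) > 15:
    # Keep the head and tail so the file name stays short but still recognizable.
    suffix = suffix[:5] + "..." + suffix[-5:]

print(suffix)       # my_ve..._name
print(len(suffix))  # 13
```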
```diff
@@ -293,6 +311,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
         log_samples=args.log_samples,
         gen_kwargs=args.gen_kwargs,
         cli_args=args,
+        predict_only=args.predict_only,
     )
 
     if results is not None:
```
```diff
@@ -315,9 +334,9 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
             for task_name, config in results["configs"].items():
                 filename = args.output_path.joinpath(f"{task_name}.json")
                 # Structure the data with 'args' and 'logs' keys
-                data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"])}  # Convert Namespace to dict
-                samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable)
-                filename.open("w").write(samples_dumped)
+                data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"]), "time": datetime_str}
+                samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable, ensure_ascii=False)
+                filename.open("w", encoding="utf-8").write(samples_dumped)
                 eval_logger.info(f"Saved samples to {filename}")
 
         return results, samples
```
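Besides stamping a `"time"` field into the dump, the changed lines matter for multilingual benchmarks: `ensure_ascii=False` keeps non-ASCII model outputs readable instead of `\uXXXX` escapes, and the explicit `encoding="utf-8"` makes the file write independent of the platform's default encoding. A small self-contained sketch of the difference (the sample record here is made up):

```python
import json
from pathlib import Path

# Made-up record standing in for one logged model output.
record = {"doc_id": 0, "response": "图中有两只猫"}

escaped = json.dumps(record, indent=4)                       # non-ASCII becomes \uXXXX escapes
readable = json.dumps(record, indent=4, ensure_ascii=False)  # characters are kept as-is

print("\\u56fe" in escaped)  # True: the first character was escaped
print("图" in readable)      # True: the character survives verbatim

# Writing with an explicit encoding avoids depending on the platform default.
with Path("example.json").open("w", encoding="utf-8") as f:
    f.write(readable)
```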