Merge branch 'main' of https://github.com/EvolvingLMMs-Lab/lmms-eval · EvolvingLMMs-Lab/lmms-eval@511b625

```diff
@@ -106,9 +106,16 @@ def parse_eval_args() -> argparse.Namespace:
     parser.add_argument(
         "--log_samples_suffix",
         type=str,
-        default="",
+        default="model_outputs",
         help="Specify a suffix for the log_samples file name.",
     )
+    parser.add_argument(
+        "--predict_only",
+        "-x",
+        action="store_true",
+        default=False,
+        help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
+    )
     parser.add_argument(
         "--show_config",
         action="store_true",
```
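As a quick illustration of how the two options touched above behave once parsed, here is a minimal, self-contained sketch; the parser below is a stand-in that reproduces only these two arguments, not the full `parse_eval_args`.

```python
import argparse

# Minimal stand-in reproducing only the two options changed in this hunk;
# the real parser in parse_eval_args defines many more arguments.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--log_samples_suffix",
    type=str,
    default="model_outputs",
    help="Specify a suffix for the log_samples file name.",
)
parser.add_argument(
    "--predict_only",
    "-x",
    action="store_true",
    default=False,
    help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
)

print(parser.parse_args([]))      # Namespace(log_samples_suffix='model_outputs', predict_only=False)
print(parser.parse_args(["-x"]))  # predict_only=True via the short flag
print(parser.parse_args(["--predict_only", "--log_samples_suffix", "run1"]))
```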

```diff
@@ -228,6 +235,10 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
 
     initialize_tasks(args.verbosity)
 
+    if args.predict_only:
+        args.log_samples = True
+    if (args.log_samples or args.predict_only) and not args.output_path:
+        raise ValueError("Specify --output_path if providing --log_samples or --predict_only")
     if args.limit:
         eval_logger.warning(" --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
     if args.include_path is not None:
```
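The guard added here makes `--predict_only` imply `--log_samples` and requires `--output_path` whenever samples are to be written. A small, hypothetical helper showing the same check in isolation (the function name and the `Namespace` literal are illustrative, not part of lmms-eval):

```python
import argparse

def check_logging_args(args: argparse.Namespace) -> None:
    # Same logic as the hunk above, pulled out for illustration:
    # --predict_only implies --log_samples, and either one requires --output_path.
    if args.predict_only:
        args.log_samples = True
    if (args.log_samples or args.predict_only) and not args.output_path:
        raise ValueError("Specify --output_path if providing --log_samples or --predict_only")

args = argparse.Namespace(predict_only=True, log_samples=False, output_path=None)
try:
    check_logging_args(args)
except ValueError as err:
    print(err)  # Specify --output_path if providing --log_samples or --predict_only
```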

```diff
@@ -244,14 +255,17 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
             "\n" + "=" * 70 + "\n" + "\n\tYou are trying to check all the numbers in each task." + "\n\tThis action will download the complete dataset." + "\n\tIf the results are not clear initially, call this again." + "\n\n" + "=" * 70
         )
         eval_logger.info(log_message)
-        task_dict = get_task_dict([task for task in sorted(ALL_TASKS)], model_name="llava")
-        for task_name in task_dict.keys():
-            task_obj = task_dict[task_name]
-            if type(task_obj) == tuple:
-                group, task_obj = task_obj
-                if task_obj is None:
-                    continue
-            eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}")
+        for task_name in sorted(ALL_TASKS):
+            try:
+                task_dict = get_task_dict([task_name], model_name="llava")
+                task_obj = task_dict[task_name]
+                if type(task_obj) == tuple:
+                    group, task_obj = task_obj
+                    if task_obj is None:
+                        continue
+                eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}")
+            except Exception as e:
+                eval_logger.debug(f"\nTask : {task_name} fail to load \n Exception : \n {e}")
         sys.exit()
     else:
         tasks_list = args.tasks.split(",")
```
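This rewrite loads each task individually inside a `try`/`except`, so one task that fails to load no longer aborts the whole `list_with_num` listing; the failure is logged at debug level and the loop moves on. A schematic sketch of the control flow, with a fake loader standing in for `get_task_dict`:

```python
# Fake loader standing in for get_task_dict, to show the control flow only;
# the task names below are just illustrative strings.
def fake_get_task_dict(names):
    if "broken_task" in names:
        raise RuntimeError("dataset download failed")
    return {name: object() for name in names}

for task_name in sorted(["mme", "broken_task", "mmbench"]):
    try:
        task_dict = fake_get_task_dict([task_name])  # one task per call
        print(f"Task : {task_name} loaded ({len(task_dict)} entry)")
    except Exception as e:
        # Logged and skipped; the remaining tasks are still listed.
        print(f"Task : {task_name} fail to load : {e}")
```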

```diff
@@ -271,6 +285,10 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
     # set datetime before evaluation
     datetime_str = utils.get_datetime_str(timezone=args.timezone)
     if args.output_path:
+        if args.log_samples_suffix and len(args.log_samples_suffix) > 15:
+            eval_logger.warning("The suffix for log_samples is too long. It is recommended to keep it under 15 characters.")
+            args.log_samples_suffix = args.log_samples_suffix[:5] + "..." + args.log_samples_suffix[-5:]
+
         hash_input = f"{args.model_args}".encode("utf-8")
         hash_output = hashlib.sha256(hash_input).hexdigest()[:6]
         path = Path(args.output_path)
```
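For a suffix longer than 15 characters, the new branch warns and then shortens it to the first five characters, an ellipsis, and the last five. Applied to an example string:

```python
# The truncation rule from the hunk above, applied to a sample suffix.
suffix = "this_is_a_very_long_suffix_name"  # 31 characters
if suffix and len(suffix) > 15:
    suffix = suffix[:5] + "..." + suffix[-5:]
print(suffix)  # this_..._name
```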

```diff
@@ -293,6 +311,7 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
         log_samples=args.log_samples,
         gen_kwargs=args.gen_kwargs,
         cli_args=args,
+        predict_only=args.predict_only,
     )
 
     if results is not None:
```

```diff
@@ -315,9 +334,9 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
             for task_name, config in results["configs"].items():
                 filename = args.output_path.joinpath(f"{task_name}.json")
                 # Structure the data with 'args' and 'logs' keys
-                data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"])}  # Convert Namespace to dict
-                samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable)
-                filename.open("w").write(samples_dumped)
+                data_to_dump = {"args": vars(args), "model_configs": config, "logs": sorted(samples[task_name], key=lambda x: x["doc_id"]), "time": datetime_str}
+                samples_dumped = json.dumps(data_to_dump, indent=4, default=_handle_non_serializable, ensure_ascii=False)
+                filename.open("w", encoding="utf-8").write(samples_dumped)
                 eval_logger.info(f"Saved samples to {filename}")
 
         return results, samples
```
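The last hunk adds a `"time"` field carrying the run's datetime string and switches the dump to `ensure_ascii=False` with a UTF-8 file handle, so non-ASCII model outputs (e.g. CJK text) are written verbatim instead of as `\uXXXX` escapes. The behavioural difference in isolation (the record below is a made-up sample log):

```python
import json

record = {"doc_id": 0, "filtered_resps": ["日本語の回答"]}  # sample log with non-ASCII text

print(json.dumps(record, indent=4))                      # non-ASCII escaped as \uXXXX
print(json.dumps(record, indent=4, ensure_ascii=False))  # text kept readable

# Opening the file with an explicit encoding keeps it valid UTF-8 even on
# platforms whose default locale encoding is not UTF-8, which is why the
# change pairs ensure_ascii=False with open("w", encoding="utf-8").
```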