chore: Update lmms_eval/models/vila.py and lmms_eval/tasks/__init__.py · EvolvingLMMs-Lab/lmms-eval@e31cd78 (original) (raw)

`@@ -34,8 +34,8 @@

`

34

34

`from llava.mm_utils import process_images

`

35

35

`except ImportError as e:

`

36

36

`print(e)

`

37

``

`-

import pdb;pdb.set_trace()

`

38

``

`-

eval_logger.debug("VILA is not installed. Please install VILA to use this model.")

`

``

37

+

``

38

`+

eval_logger.debug("VILA is not installed. Please install VILA to use this model. Error: {e}")

`

39

39

``

40

40

``

41

41

`@register_model("vila")

`

`@@ -202,7 +202,7 @@ def load_video(self, video_path, max_frames_num):

`

202

202

`return [Image.fromarray(img) for img in spare_frames]

`

203

203

`except Exception as e:

`

204

204

`eval_logger.error(f"Failed to load video {video_path} with error: {e}")

`

205

``

`-

import pdb;pdb.set_trace()

`

``

205

+

206

206

`return [Image.new("RGB", (448, 448), (0, 0, 0))] * max_frames_num

`

207

207

``

208

208

`def tok_decode(self, tokens):

`

`@@ -279,7 +279,7 @@ def generate_until(self, requests) -> List[str]:

`

279

279

``

280

280

`for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:

`

281

281

`# if self.task_dict[task][split][doc_id]["duration"] != "short":

`

282

``

`-

# import pdb;pdb.set_trace()

`

``

282

`+

`

283

283

`# res.append("A")

`

284

284

`# pbar.update(1)

`

285

285

`# continue

`

`@@ -289,20 +289,20 @@ def generate_until(self, requests) -> List[str]:

`

289

289

``

290

290

`num_video_frames = self.model.config.num_video_frames

`

291

291

`videos = []

`

292

``

`-

import pdb;pdb.set_trace()

`

``

292

+

293

293

`if self.max_frames_num == 0:

`

294

294

`images = [Image.new("RGB", (448, 448), (0, 0, 0))] * num_video_frames

`

295

295

`video = process_images(images, self.model.image_processor, self.model.config).half().cuda()

`

296

296

`videos.append(video)

`

297

297

`else:

`

298

298

`for visual in visuals:

`

299

299

`# images, video_loading_succeed = LazySupervisedDataset._load_video(visual, num_video_frames, self.model)

`

300

``

`-

import pdb;pdb.set_trace()

`

``

300

+

301

301

`if self.video_decode_backend == "decord":

`

302

302

`images = self.load_video(visual, num_video_frames)

`

303

303

`elif self.video_decode_backend == "pyav":

`

304

304

`images = read_video_pyav(visual, num_frm=num_video_frames)

`

305

``

`-

import pdb;pdb.set_trace()

`

``

305

+

306

306

`video = process_images(images, self.model.image_processor, self.model.config).half().cuda()

`

307

307

`videos.append(video)

`

308

308

``

`@@ -350,7 +350,7 @@ def generate_until(self, requests) -> List[str]:

`

350

350

`if "num_beams" not in gen_kwargs:

`

351

351

`gen_kwargs["num_beams"] = 1

`

352

352

``

353

``

`-

import pdb;pdb.set_trace()

`

``

353

+

354

354

`with torch.inference_mode():

`

355

355

`output_ids = self.model.generate(

`

356

356

`input_ids=input_ids,

`

`@@ -370,7 +370,7 @@ def generate_until(self, requests) -> List[str]:

`

370

370

`outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

`

371

371

`print("Question: ", cur_prompt)

`

372

372

`print("Answer: ", outputs)

`

373

``

`-

import pdb;pdb.set_trace()

`

``

373

+

374

374

`res.append(outputs)

`

375

375

`pbar.update(1)

`

376

376

`return res

`