chore: Update lmms_eval/models/vila.py and lmms_eval/tasks/__init__.py · EvolvingLMMs-Lab/lmms-eval@e31cd78
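The commit strips leftover `import pdb;pdb.set_trace()` debugging statements from the VILA model wrapper and folds the caught ImportError into the install hint. All hunks shown below are from `lmms_eval/models/vila.py`.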
```diff
@@ -34,8 +34,8 @@
     from llava.mm_utils import process_images
 except ImportError as e:
     print(e)
-    import pdb;pdb.set_trace()
-    eval_logger.debug("VILA is not installed. Please install VILA to use this model.")
+
+    eval_logger.debug("VILA is not installed. Please install VILA to use this model. Error: {e}")
 
 
 @register_model("vila")
```
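Note that the replacement debug message is not an f-string, so it will log the literal text `{e}` rather than the caught error. A minimal sketch of the guarded-import pattern with the interpolation actually working; the `logging` setup here is a stand-in for lmms-eval's `eval_logger`:

```python
# Minimal sketch of the optional-import guard above, with the error actually
# interpolated -- the committed line lacks the f prefix, so it logs the
# literal text "{e}". The logging setup is a stand-in for eval_logger.
import logging

eval_logger = logging.getLogger("lmms-eval")

try:
    from llava.mm_utils import process_images  # provided by VILA's llava fork
except ImportError as e:
    print(e)
    eval_logger.debug(f"VILA is not installed. Please install VILA to use this model. Error: {e}")
```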
```diff
@@ -202,7 +202,7 @@ def load_video(self, video_path, max_frames_num):
             return [Image.fromarray(img) for img in spare_frames]
         except Exception as e:
             eval_logger.error(f"Failed to load video {video_path} with error: {e}")
-            import pdb;pdb.set_trace()
+
             return [Image.new("RGB", (448, 448), (0, 0, 0))] * max_frames_num
 
     def tok_decode(self, tokens):
```
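With the breakpoint gone, a failed decode now logs the error and returns black placeholder frames, so one corrupt file no longer halts an entire evaluation run. A self-contained sketch of that decode-with-fallback shape, assuming decord as the backend; the 448x448 placeholder size and frame count mirror the diff, while the sampling strategy is only a plausible guess at what `load_video` does:

```python
import numpy as np
from PIL import Image
from decord import VideoReader, cpu

def load_video(video_path: str, max_frames_num: int) -> list:
    try:
        vr = VideoReader(video_path, ctx=cpu(0))
        # Uniformly sample max_frames_num frame indices across the clip.
        idx = np.linspace(0, len(vr) - 1, max_frames_num, dtype=int).tolist()
        frames = vr.get_batch(idx).asnumpy()  # (N, H, W, 3) uint8
        return [Image.fromarray(f) for f in frames]
    except Exception as e:
        print(f"Failed to load video {video_path} with error: {e}")
        # Fall back to black frames instead of dropping into a debugger.
        return [Image.new("RGB", (448, 448), (0, 0, 0))] * max_frames_num
```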
```diff
@@ -279,7 +279,7 @@ def generate_until(self, requests) -> List[str]:
 
         for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
             # if self.task_dict[task][split][doc_id]["duration"] != "short":
-            # import pdb;pdb.set_trace()
+
             # res.append("A")
             # pbar.update(1)
             # continue
```
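For orientation, the `for contexts, gen_kwargs, ...` line above unpacks the argument tuple that each lmms-eval request carries in its `.args` attribute. The `Request` class below is an illustrative stand-in for the library's request object, with sample values invented for the demo:

```python
from typing import NamedTuple

class Request(NamedTuple):
    # (contexts, gen_kwargs, doc_to_visual, doc_id, task, split)
    args: tuple

requests = [Request(("Describe the video.", {"max_new_tokens": 16}, None, 0, "videomme", "test"))]
for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
    print(task, split, doc_id, contexts)
```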
```diff
@@ -289,20 +289,20 @@ def generate_until(self, requests) -> List[str]:
 
             num_video_frames = self.model.config.num_video_frames
             videos = []
-            import pdb;pdb.set_trace()
+
             if self.max_frames_num == 0:
                 images = [Image.new("RGB", (448, 448), (0, 0, 0))] * num_video_frames
                 video = process_images(images, self.model.image_processor, self.model.config).half().cuda()
                 videos.append(video)
             else:
                 for visual in visuals:
                     # images, video_loading_succeed = LazySupervisedDataset._load_video(visual, num_video_frames, self.model)
-                    import pdb;pdb.set_trace()
+
                     if self.video_decode_backend == "decord":
                         images = self.load_video(visual, num_video_frames)
                     elif self.video_decode_backend == "pyav":
                         images = read_video_pyav(visual, num_frm=num_video_frames)
-                    import pdb;pdb.set_trace()
+
                     video = process_images(images, self.model.image_processor, self.model.config).half().cuda()
                     videos.append(video)
 
```
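This hunk is the frame-sourcing core of `generate_until`: either synthesize blank frames when `max_frames_num == 0`, or decode each visual with the configured backend before `process_images` stacks the frames into a half-precision CUDA tensor. A hedged sketch of the dispatch, with the decoders passed in as callables since `load_video` and `read_video_pyav` live elsewhere in the file:

```python
from typing import Callable, Dict, List
from PIL import Image

def frames_for(
    visual: str,
    backend: str,
    num_video_frames: int,
    max_frames_num: int,
    decoders: Dict[str, Callable[[str, int], List[Image.Image]]],
) -> List[Image.Image]:
    if max_frames_num == 0:
        # No decoding requested: feed blank 448x448 frames of the expected count.
        return [Image.new("RGB", (448, 448), (0, 0, 0))] * num_video_frames
    if backend not in decoders:  # "decord" or "pyav" in the diff
        raise ValueError(f"Unknown video_decode_backend: {backend}")
    return decoders[backend](visual, num_video_frames)
```

Keeping the decoders in a dict mirrors the if/elif chain while making explicit that an unrecognized backend would otherwise fall through with `images` undefined and raise a NameError at the `process_images` call.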
```diff
@@ -350,7 +350,7 @@ def generate_until(self, requests) -> List[str]:
             if "num_beams" not in gen_kwargs:
                 gen_kwargs["num_beams"] = 1
 
-            import pdb;pdb.set_trace()
+
             with torch.inference_mode():
                 output_ids = self.model.generate(
                     input_ids=input_ids,
```
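Dropping this breakpoint leaves default-filling followed directly by generation under `torch.inference_mode()`, which disables autograd tracking for the forward pass. A sketch of the same pattern; `dict.setdefault` is equivalent to the diff's membership guard, and `model`/`input_ids` in the commented call site are assumed to exist:

```python
import torch  # used in the commented call-site sketch below

def fill_gen_defaults(gen_kwargs: dict) -> dict:
    # Equivalent to: if "num_beams" not in gen_kwargs: gen_kwargs["num_beams"] = 1
    gen_kwargs.setdefault("num_beams", 1)
    return gen_kwargs

# At the call site, roughly:
# with torch.inference_mode():
#     output_ids = model.generate(input_ids=input_ids, **fill_gen_defaults(gen_kwargs))
```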
```diff
@@ -370,7 +370,7 @@ def generate_until(self, requests) -> List[str]:
             outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
             print("Question: ", cur_prompt)
             print("Answer: ", outputs)
-            import pdb;pdb.set_trace()
+
             res.append(outputs)
             pbar.update(1)
         return res
```
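The final hunk removes the last breakpoint from the loop tail, which decodes the single returned sequence, records it, and advances the progress bar. A self-contained illustration of the `batch_decode(...)[0].strip()` idiom, using the public gpt2 tokenizer purely for availability (VILA ships its own tokenizer):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
output_ids = tokenizer("  Answer: a black frame.  ", return_tensors="pt").input_ids
# batch_decode returns one string per sequence; [0] takes the only one.
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(outputs)  # -> "Answer: a black frame."
```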