Fix instructblip qformer size mismatch and multi-images problem · EvolvingLMMs-Lab/lmms-eval@0932932

@@ -6,6 +6,7 @@
 from lmms_eval.api.instance import Instance
 from lmms_eval.api.model import lmms
 from lmms_eval.api.registry import register_model
+from lmms_eval.tasks.mmmu.utils_group_img import process_images
 from accelerate import Accelerator, DistributedType
 from accelerate.state import AcceleratorState
 from typing import List, Optional, Union, Tuple
@@ -187,7 +188,13 @@ def _collate(x):
187 188 if "" in context:
188 189 # instruct blip does not expect the tag
189 190 context = context.replace("", "")
190 -inputs = self._image_processor(images=visuals, text=context, return_tensors="pt").to(self.device)
191 +# Set trunction equals true here, the max length for qformer tokenizer is 512
192 +# if not truncate, some questions will cause size mismatch
193 +# The transformer implementation can't handle multi images for blip
194 +# Concat it into one image
195 +if len(visuals) > 1:
196 +visuals = [process_images(visuals)]
197 +inputs = self._image_processor(images=visuals, text=context, return_tensors="pt", truncation=True).to(self.device)

             gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))]
             if "max_new_tokens" not in gen_kwargs: