@@ -6,6 +6,7 @@
 from lmms_eval.api.instance import Instance
 from lmms_eval.api.model import lmms
 from lmms_eval.api.registry import register_model
+from lmms_eval.tasks.mmmu.utils_group_img import process_images
 from accelerate import Accelerator, DistributedType
 from accelerate.state import AcceleratorState
 from typing import List, Optional, Union, Tuple
@@ -187,7 +188,13 @@ def _collate(x):
             if "<image>" in context:
                 # instruct blip does not expect the <image> tag
                 context = context.replace("<image>", "")
-            inputs = self._image_processor(images=visuals, text=context, return_tensors="pt").to(self.device)
+            # Set truncation=True here: the max length for the qformer tokenizer is 512,
+            # and without truncation some questions cause a size mismatch.
+            # The transformers implementation can't handle multiple images for BLIP,
+            # so concatenate them into a single image first.
+            if len(visuals) > 1:
+                visuals = [process_images(visuals)]
+            inputs = self._image_processor(images=visuals, text=context, return_tensors="pt", truncation=True).to(self.device)

             gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))]
             if "max_new_tokens" not in gen_kwargs: