@@ -110,18 +110,6 @@ def __init__(
             overwrite_config["mm_spatial_pool_mode"] = self.mm_spatial_pool_mode
             cfg_pretrained = AutoConfig.from_pretrained(self.pretrained)
 
-            if cfg_pretrained.architectures[0] == "LlavaLlamaForCausalLM":  # Ugly code, only used in vicuna that needs ROPE
-                if "224" in cfg_pretrained.mm_vision_tower:
-                    least_token_number = self.max_frames_num * (16 // self.mm_spatial_pool_stride) ** 2 + 1000
-                else:
-                    least_token_number = self.max_frames_num * (24 // self.mm_spatial_pool_stride) ** 2 + 1000
-
-                scaling_factor = math.ceil(least_token_number / 4096)
-                if scaling_factor >= 2:
-                    overwrite_config["rope_scaling"] = {"factor": float(scaling_factor), "type": "linear"}
-                    overwrite_config["max_sequence_length"] = 4096 * scaling_factor
-                    overwrite_config["tokenizer_model_max_length"] = 4096 * scaling_factor
-
             llava_model_args["overwrite_config"] = overwrite_config
             try:
                 # Try to load the model with the multimodal argument
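
For reference, the deleted block was a vicuna-only path that stretched the 4096-token context via linear RoPE scaling when the estimated video token count exceeded it. Below is a minimal standalone sketch of that arithmetic, not the model code itself; the example values (`max_frames_num = 32`, `mm_spatial_pool_stride = 2`, a non-224px vision tower) are assumptions chosen only to make the numbers concrete.

```python
import math

# Assumed example inputs (not taken from the diff): 32 sampled frames,
# spatial pool stride 2, non-224px vision tower (24x24 patch grid).
max_frames_num = 32
mm_spatial_pool_stride = 2

# Estimated token budget: pooled per-frame visual tokens plus ~1000 text tokens.
least_token_number = max_frames_num * (24 // mm_spatial_pool_stride) ** 2 + 1000
# 32 * 144 + 1000 = 5608

# Stretch the 4096-token context by a whole-number factor when the estimate exceeds it.
scaling_factor = math.ceil(least_token_number / 4096)  # ceil(5608 / 4096) = 2

overwrite_config = {}
if scaling_factor >= 2:
    overwrite_config["rope_scaling"] = {"factor": float(scaling_factor), "type": "linear"}
    overwrite_config["max_sequence_length"] = 4096 * scaling_factor  # 8192
    overwrite_config["tokenizer_model_max_length"] = 4096 * scaling_factor

print(overwrite_config)
```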