Merge branch 'main' into dev/interleave · EvolvingLMMs-Lab/lmms-eval@d78ec86 (original) (raw)
`@@ -10,6 +10,8 @@
`
10
10
`import sys
`
11
11
`from typing import List, Dict, Optional, Union
`
12
12
`import re
`
``
13
`+
import cv2
`
``
14
`+
import numpy as np
`
13
15
``
14
16
`from loguru import logger as eval_logger
`
15
17
``
80
82
`# cache_dir = os.path.join(hf_home, cache_dir)
`
81
83
`# base_cache_dir = config["dataset_kwargs"]["cache_dir"]
`
82
84
base_cache_dir = os.path.expanduser(hf_home)

# Resolve the dataset cache directory name once at import time from the task
# YAML that sits next to this module.
with open(Path(__file__).parent / "videomme.yaml", "r") as f:
    raw_data = f.readlines()

# Remove "!function" lines since yaml.safe_load cannot handle that tag.
safe_data = [line for line in raw_data if "!function" not in line]

cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
`
``
93
+
``
94
+
``
95
`+
def parse_subtitle_time(time_str):
    """Convert an SRT-style timestamp into seconds.

    Args:
        time_str: Timestamp of the form ``HH:MM:SS,mmm``. A ``.`` before the
            milliseconds is accepted as well, and the millisecond part may be
            omitted (treated as zero).

    Returns:
        The timestamp in seconds; the milliseconds become the fractional part.

    Raises:
        ValueError: If the string does not contain exactly three
            ``:``-separated fields, or a field is not an integer.
    """
    hours, minutes, seconds_part = time_str.strip().split(":")
    # SRT writes "SS,mmm"; normalise the "SS.mmm" variant so both parse alike.
    seconds, _, millis = seconds_part.replace(".", ",").partition(",")
    return int(hours) * 3600 + int(minutes) * 60 + int(seconds) + int(millis or 0) / 1000
`
``
99
+
``
100
`+
def load_subtitles(subtitle_path):
    """Parse an SRT subtitle file into a mapping keyed by time range.

    The file is split into cues on blank lines. For each cue the second line
    is read as the ``start --> end`` timing line and every following line is
    treated as cue text.

    Args:
        subtitle_path: Path to a subtitle file, read as UTF-8.

    Returns:
        dict mapping ``(start_seconds, end_seconds)`` to the cue text, with
        multi-line cue text joined by single spaces.
    """
    subtitles = {}
    with open(subtitle_path, "r", encoding="utf-8") as srt_file:
        sections = srt_file.read().split("\n\n")

    for section in sections:
        # Strip first: extra blank lines between cues leave a leading "\n" on
        # the next section, which would shift the timing line off index 1, and
        # a trailing newline would add an empty text line to the last cue.
        lines = section.strip().split("\n")
        if len(lines) < 3:
            continue
        time_range = lines[1].split(" --> ")
        if len(time_range) < 2:
            # Not a timing line; skip the malformed cue instead of raising.
            continue
        start_time = parse_subtitle_time(time_range[0])
        end_time = parse_subtitle_time(time_range[1])
        subtitles[(start_time, end_time)] = " ".join(lines[2:])
    return subtitles
`
``
114
+
``
115
`+
def convert_time_to_frame(time_in_seconds, fps):
    """Return the frame index reached after ``time_in_seconds`` at ``fps``.

    The product of the two arguments is truncated by ``int``.
    """
    frame_position = time_in_seconds * fps
    return int(frame_position)
`
``
117
+
``
118
`+
def extract_subtitles(video_path, subtitle_path):
    """Map each subtitle cue of a video onto frame indices.

    Args:
        video_path: Path to the video file, opened with ``cv2.VideoCapture``.
        subtitle_path: Path to the subtitle file, read by ``load_subtitles``.

    Returns:
        Tuple ``(subtitle_frames, total_frame)``. ``subtitle_frames`` is a
        list of ``(start_frame, end_frame, text)`` tuples and ``total_frame``
        is the frame count reported by OpenCV for the video.
    """
    video = cv2.VideoCapture(video_path)
    try:
        fps = video.get(cv2.CAP_PROP_FPS)
        total_frame = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        # Only metadata is needed, so free the capture handle right away
        # instead of leaking one handle per processed document.
        video.release()

    subtitles = load_subtitles(subtitle_path)
    subtitle_frames = [
        (convert_time_to_frame(start_time, fps), convert_time_to_frame(end_time, fps), text)
        for (start_time, end_time), text in subtitles.items()
    ]
    return subtitle_frames, total_frame
`
84
131
``
85
132
`def videomme_doc_to_visual(doc):
`
86
``
`-
with open(Path(file).parent / "videomme.yaml", "r") as f:
`
87
``
`-
raw_data = f.readlines()
`
88
``
`-
safe_data = []
`
89
``
`-
for i, line in enumerate(raw_data):
`
90
``
`-
remove function definition since yaml load cannot handle it
`
91
``
`-
if "!function" not in line:
`
92
``
`-
safe_data.append(line)
`
93
``
`-
cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]
`
``
133
+
94
134
`cache_dir = os.path.join(base_cache_dir, cache_name)
`
95
135
`video_path = doc["videoID"] + ".mp4"
`
96
136
`video_path = os.path.join(cache_dir, video_path)
`
`@@ -106,6 +146,71 @@ def videomme_doc_to_visual(doc):
`
106
146
``
107
147
``
108
148
def videomme_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Build the multiple-choice prompt for one document.

    Args:
        doc: Mapping with a ``"question"`` string and an ``"options"`` value;
            the options are rendered with ``str()``.
        model_specific_prompt_kwargs: Accepted for interface compatibility;
            not read by this function.

    Returns:
        The instruction line, the question, the stringified options and the
        trailing ``"The best answer is:"`` cue, separated by newlines.
    """
    # NOTE(review): the instruction mentions subtitles although this variant
    # adds none to the prompt; wording kept as-is to preserve prompts.
    instruction = "Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
    question_with_options = "\n".join((doc["question"], str(doc["options"])))
    return "\n".join((instruction, question_with_options, "The best answer is:"))
`
``
155
`+
# Frames + Subs
# This video's subtitles are listed below:
# 【subtitles】

# Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option.
# 【question】
# The best answer is:
# Frames / Frames + Audio
# Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.
# 【question】
# The best answer is:
`
``
166
+
``
167
`+
def videomme_doc_to_text_subtitle(doc, model_specific_prompt_kwargs=None):
`
``
168
`+
cache_dir = os.path.join(base_cache_dir, cache_name)
`
``
169
`+
video_path = doc["videoID"] + ".mp4"
`
``
170
`+
subtitle_path=os.path.join(cache_dir,"subtitle",doc["videoID"]+".srt")
`
``
171
`+
video_path = os.path.join(cache_dir, video_path)
`
``
172
`+
if os.path.exists(subtitle_path): #Denote have subtitle
`
``
173
`+
subtitle=open(subtitle_path).readlines()
`
``
174
`+
else:
`
``
175
`+
subtitle=""
`
``
176
`+
subtitles_prompt="This video's subtitles are listed below: \n"
`
``
177
`+
if subtitle=="":
`
``
178
`+
subtitle="No subtitles available"
`
``
179
`+
else:
`
``
180
`+
if "gemini_api_flag" in model_specific_prompt_kwargs: #specific for gemini_api
`
``
181
`+
if model_specific_prompt_kwargs['gemini_api_flag']=="full subtitle":
`
``
182
`+
textlist=[]
`
``
183
`+
for ele in subtitle:
`
``
184
`+
pattern = r'(.*?)'
`
``
185
`+
matches = re.findall(pattern, ele)
`
``
186
`+
if matches:
`
``
187
`+
textlist.append(matches[0])
`
``
188
`+
subtitle_text="\n".join(textlist)
`
``
189
`+
else:
`
``
190
`+
if "frame_num" in model_specific_prompt_kwargs:
`
``
191
`+
frame_num=model_specific_prompt_kwargs['frame_num']
`
``
192
`+
subtitle_by_frame,total_frame=extract_subtitles(video_path,subtitle_path)
`
``
193
`+
uniform_sampled_frames = np.linspace(0, total_frame - 1, frame_num, dtype=int).tolist()
`
``
194
+
``
195
`+
subtitle_by_frame_idx=[]
`
``
196
`+
for frame_idx in uniform_sampled_frames:
`
``
197
`+
for idx,title in enumerate(subtitle_by_frame):
`
``
198
`+
if frame_idx<title[1] and frame_idx>=title[0]:
`
``
199
`+
subtitle_by_frame_idx.append(idx)
`
``
200
`+
subtitle_by_frame_idx=list(set(subtitle_by_frame_idx))
`
``
201
+
``
202
`+
textlist=[]
`
``
203
`+
for idx in subtitle_by_frame_idx:
`
``
204
`+
pattern = r'(.*?)'
`
``
205
`+
raw_text=re.findall(pattern, subtitle_by_frame[idx][2])
`
``
206
`+
try:
`
``
207
`+
textlist.append(raw_text[0])
`
``
208
`+
except:
`
``
209
`+
continue
`
``
210
`+
subtitle_text="\n".join(textlist)
`
``
211
`+
subtitle=subtitle_text
`
``
212
+
``
213
`+
option_prompt="Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
`
109
214
`question = doc["question"]
`
110
215
`option = str(doc["options"])
`
111
216
`question = question + "\n" + option + model_specific_prompt_kwargs["post_prompt"]
`