Merge branch 'main' into dev/interleave · EvolvingLMMs-Lab/lmms-eval@d78ec86 (original) (raw)

`@@ -10,6 +10,8 @@

`

10

10

`import sys

`

11

11

`from typing import List, Dict, Optional, Union

`

12

12

`import re

`

``

13

`+

import cv2

`

``

14

`+

import numpy as np

`

13

15

``

14

16

`from loguru import logger as eval_logger

`

15

17

``

80

82

`# cache_dir = os.path.join(hf_home, cache_dir)

`

81

83

`# base_cache_dir = config["dataset_kwargs"]["cache_dir"]

`

82

84

`base_cache_dir = os.path.expanduser(hf_home)

`

83

``

-

``

85

`+

# Load the task's YAML config to find the dataset cache directory name.
# NOTE(review): `file`, `Path`, and `yaml` are assumed to be bound earlier in
# the module (outside this view) — confirm; `file` is presumably __file__.
with open(Path(file).parent / "videomme.yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definition since yaml load cannot handle it
        if "!function" not in line:
            safe_data.append(line)

# Subdirectory name under the HF cache root; joined with base_cache_dir below.
cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]

`

``

93

+

``

94

+

``

95

`+

def parse_subtitle_time(time_str):
    """Convert an SRT timestamp of the form 'HH:MM:SS,mmm' to seconds (float)."""
    hours, minutes, rest = time_str.split(':')
    seconds, millis = rest.split(',')
    total = int(hours) * 3600
    total += int(minutes) * 60
    total += int(seconds)
    return total + int(millis) / 1000

``

99

+

``

100

`+

def load_subtitles(subtitle_path):
    """Parse an SRT file into a dict mapping (start_sec, end_sec) -> text.

    Multi-line cues are flattened into a single space-joined string. Blocks
    with fewer than three lines (index, timing, text) are skipped.
    """
    subtitles = {}
    with open(subtitle_path, 'r', encoding='utf-8') as file:
        blocks = file.read().split('\n\n')
    for block in blocks:
        if not block.strip():
            continue
        entry_lines = block.split('\n')
        if len(entry_lines) < 3:
            continue
        times = entry_lines[1].split(' --> ')
        key = (parse_subtitle_time(times[0]), parse_subtitle_time(times[1]))
        subtitles[key] = ' '.join(entry_lines[2:])
    return subtitles

`

``

114

+

``

115

`+

def convert_time_to_frame(time_in_seconds, fps):
    """Map a timestamp in seconds to a frame index at the given fps (truncated)."""
    frame_position = time_in_seconds * fps
    return int(frame_position)

`

``

117

+

``

118

`+

def extract_subtitles(video_path, subtitle_path):
    """Align SRT subtitles with video frame indices.

    Args:
        video_path: Path to the video file; opened with OpenCV only to read
            its fps and total frame count.
        subtitle_path: Path to the matching .srt subtitle file.

    Returns:
        Tuple (subtitle_frames, total_frame): subtitle_frames is a list of
        (start_frame, end_frame, text) tuples; total_frame is the video's
        frame count as reported by OpenCV.
    """
    video = cv2.VideoCapture(video_path)
    try:
        fps = video.get(cv2.CAP_PROP_FPS)
        total_frame = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        # Fix: the capture handle was never released, leaking the underlying
        # file descriptor / decoder context on every call.
        video.release()

    subtitles = load_subtitles(subtitle_path)

    subtitle_frames = []
    for (start_time, end_time), text in subtitles.items():
        start_frame = convert_time_to_frame(start_time, fps)
        end_frame = convert_time_to_frame(end_time, fps)
        subtitle_frames.append((start_frame, end_frame, text))

    return subtitle_frames, total_frame

`

84

131

``

85

132

`def videomme_doc_to_visual(doc):

`

86

``

`-

with open(Path(file).parent / "videomme.yaml", "r") as f:

`

87

``

`-

raw_data = f.readlines()

`

88

``

`-

safe_data = []

`

89

``

`-

for i, line in enumerate(raw_data):

`

90

``

`-

remove function definition since yaml load cannot handle it

`

91

``

`-

if "!function" not in line:

`

92

``

`-

safe_data.append(line)

`

93

``

`-

cache_name = yaml.safe_load("".join(safe_data))["dataset_kwargs"]["cache_dir"]

`

``

133

+

94

134

`cache_dir = os.path.join(base_cache_dir, cache_name)

`

95

135

`video_path = doc["videoID"] + ".mp4"

`

96

136

`video_path = os.path.join(cache_dir, video_path)

`

`@@ -106,6 +146,71 @@ def videomme_doc_to_visual(doc):

`

106

146

``

107

147

``

108

148

def videomme_doc_to_text(doc, model_specific_prompt_kwargs=None):
    """Build the multiple-choice prompt (instruction + question + options) for a Video-MME doc."""
    instruction = "Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."
    body = "\n".join([doc["question"], str(doc["options"])])
    return "\n".join([instruction, body, "The best answer is:"])

`

``

155

`+

Frames + Subs

`

``

156

`+

This video's subtitles are listed below:

`

``

157

`+

【subtitles】

`

``

158

+

``

159

`+

Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option.

`

``

160

`+

【question】

`

``

161

`+

The best answer is:

`

``

162

`+

Frames / Frames + Audio

`

``

163

`+

Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.

`

``

164

`+

【question】

`

``

165

`+

The best answer is:

`

``

166

+

``

167

`+

def videomme_doc_to_text_subtitle(doc, model_specific_prompt_kwargs=None):

`

``

168

`+

cache_dir = os.path.join(base_cache_dir, cache_name)

`

``

169

`+

video_path = doc["videoID"] + ".mp4"

`

``

170

`+

subtitle_path=os.path.join(cache_dir,"subtitle",doc["videoID"]+".srt")

`

``

171

`+

video_path = os.path.join(cache_dir, video_path)

`

``

172

`+

if os.path.exists(subtitle_path): #Denote have subtitle

`

``

173

`+

subtitle=open(subtitle_path).readlines()

`

``

174

`+

else:

`

``

175

`+

subtitle=""

`

``

176

`+

subtitles_prompt="This video's subtitles are listed below: \n"

`

``

177

`+

if subtitle=="":

`

``

178

`+

subtitle="No subtitles available"

`

``

179

`+

else:

`

``

180

`+

if "gemini_api_flag" in model_specific_prompt_kwargs: #specific for gemini_api

`

``

181

`+

if model_specific_prompt_kwargs['gemini_api_flag']=="full subtitle":

`

``

182

`+

textlist=[]

`

``

183

`+

for ele in subtitle:

`

``

184

`+

pattern = r'(.*?)'

`

``

185

`+

matches = re.findall(pattern, ele)

`

``

186

`+

if matches:

`

``

187

`+

textlist.append(matches[0])

`

``

188

`+

subtitle_text="\n".join(textlist)

`

``

189

`+

else:

`

``

190

`+

if "frame_num" in model_specific_prompt_kwargs:

`

``

191

`+

frame_num=model_specific_prompt_kwargs['frame_num']

`

``

192

`+

subtitle_by_frame,total_frame=extract_subtitles(video_path,subtitle_path)

`

``

193

`+

uniform_sampled_frames = np.linspace(0, total_frame - 1, frame_num, dtype=int).tolist()

`

``

194

+

``

195

`+

subtitle_by_frame_idx=[]

`

``

196

`+

for frame_idx in uniform_sampled_frames:

`

``

197

`+

for idx,title in enumerate(subtitle_by_frame):

`

``

198

`+

if frame_idx<title[1] and frame_idx>=title[0]:

`

``

199

`+

subtitle_by_frame_idx.append(idx)

`

``

200

`+

subtitle_by_frame_idx=list(set(subtitle_by_frame_idx))

`

``

201

+

``

202

`+

textlist=[]

`

``

203

`+

for idx in subtitle_by_frame_idx:

`

``

204

`+

pattern = r'(.*?)'

`

``

205

`+

raw_text=re.findall(pattern, subtitle_by_frame[idx][2])

`

``

206

`+

try:

`

``

207

`+

textlist.append(raw_text[0])

`

``

208

`+

except:

`

``

209

`+

continue

`

``

210

`+

subtitle_text="\n".join(textlist)

`

``

211

`+

subtitle=subtitle_text

`

``

212

+

``

213

`+

option_prompt="Select the best answer to the following multiple-choice question based on the video and the subtitles. Respond with only the letter (A, B, C, or D) of the correct option."

`

109

214

`question = doc["question"]

`

110

215

`option = str(doc["options"])

`

111

216

`question = question + "\n" + option + model_specific_prompt_kwargs["post_prompt"]

`