Merge pull request #130 from lscpku/vitatecs · EvolvingLMMs-Lab/lmms-eval@11fd7e3

```python
from decord import VideoReader, cpu
import numpy as np
import os
import sys
import datetime
import lmms_eval.tasks._task_utils.file_utils as file_utils
import json
import logging
import yaml
from pathlib import Path

import requests
import openai
from openai import OpenAI
import time
import ast
from tqdm import tqdm
import random

import re

with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        # remove function definitions since yaml.safe_load cannot handle them
        if "!function" not in line:
            safe_data.append(line)

    config = yaml.safe_load("".join(safe_data))
```
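An aside on the `!function` filter above: `yaml.safe_load` refuses custom tags such as the `!function` entries used in lmms-eval task templates, so those lines are dropped before parsing. A minimal sketch with an invented snippet (not the actual `_default_template_yaml`):

```python
import yaml

# Hypothetical config lines, shaped like an lmms-eval task template.
snippet = "dataset_kwargs:\n  cache_dir: vitatecs\nprocess_results: !function utils.vitatecs_process_results\n"

try:
    yaml.safe_load(snippet)
except yaml.YAMLError as err:
    print(type(err).__name__)  # ConstructorError: no constructor for tag '!function'

# Dropping the !function line makes the rest safely loadable.
filtered = "".join(line for line in snippet.splitlines(keepends=True) if "!function" not in line)
print(yaml.safe_load(filtered))  # {'dataset_kwargs': {'cache_dir': 'vitatecs'}}
```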

```python
API_TYPE = os.getenv("API_TYPE", "openai")

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }

# We unzip all the zip files to the HF_HOME cache dir
# and load the videos from there (requires HF_HOME to be set).
HF_HOME = os.environ["HF_HOME"]
cache_dir = config["dataset_kwargs"]["cache_dir"]
cache_dir = os.path.join(HF_HOME, cache_dir)

eval_logger = logging.getLogger("lmms-eval")


# Returns the video path for this doc.
# Can only work correctly with a video LLM.
def vitatecs_doc_to_visual(doc):
    video_path = os.path.join(cache_dir, doc["src_dataset"], doc["video_name"])
    if not os.path.exists(video_path):
        sys.exit(f"video path: {video_path} does not exist, please check")
    return [video_path]


# This is the place where you format your question.
def vitatecs_doc_to_text(doc, model_specific_prompt_kwargs=None):
    if model_specific_prompt_kwargs is None:
        model_specific_prompt_kwargs = {}
    pre_prompt = model_specific_prompt_kwargs.get("pre_prompt", "")
    post_prompt = model_specific_prompt_kwargs.get("post_prompt", "")

    question, _, _ = format_question_and_answer(doc)
    return f"{pre_prompt}{question}{post_prompt}"


def process_option_for_question(sent):
    if not sent.endswith("."):
        sent += "."
    return sent.capitalize()


def process_option_for_matching(sent):
    if sent.endswith("."):
        sent = sent[:-1]
    return sent.lower()


def format_question_and_answer(doc):
    # Seed the RNG from the caption/counterfactual pair so the (A)/(B)
    # ordering is deterministic for a given example.
    seed = sum(ord(c) for c in doc["caption"] + doc["counterfactual"]) % 100
    random.seed(seed)
    if random.random() > 0.5:
        option_a = process_option_for_question(doc["caption"])
        option_b = process_option_for_question(doc["counterfactual"])
        answer = "(A) " + option_a
    else:
        option_a = process_option_for_question(doc["counterfactual"])
        option_b = process_option_for_question(doc["caption"])
        answer = "(B) " + option_b
    options = [process_option_for_matching(doc["caption"]), process_option_for_matching(doc["counterfactual"])]

    question = f"Which of the following best describes the content of the video: \n(A) {option_a} \n(B) {option_b}"
    return question, answer, options
```
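Because the RNG is seeded from the caption/counterfactual pair, the (A)/(B) ordering is stable per example across runs. A quick illustration with an invented doc, assuming the function above is in scope:

```python
# Hypothetical record; only the two text fields are needed here.
doc = {"caption": "A dog runs across the yard.", "counterfactual": "A dog sleeps in the yard."}

q1, a1, opts1 = format_question_and_answer(doc)
q2, a2, opts2 = format_question_and_answer(doc)
assert (q1, a1, opts1) == (q2, a2, opts2)  # same doc -> same ordering
print(a1)  # the ground truth is always the caption side, labelled (A) or (B)
```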

```python
def vitatecs_doc_to_answer(doc):
    _, answer, _ = format_question_and_answer(doc)
    return answer


# Process a single model result against the gold answer.
def vitatecs_process_results(doc, result):
    pred = result[0]
    rating = 0
    match_success = True
    chatgpt_response = None
    chatgpt_prompt = None
    question, answer, options = format_question_and_answer(doc)

    # Some hand-crafted matching rules
    if options[0] in pred.lower() and options[1] not in pred.lower():
        rating = 1
    elif options[1] in pred.lower() and options[0] not in pred.lower():
        rating = 0
    elif pred in ["A", "B"]:
        rating = 1 if pred == answer[1] else 0
    elif any(pred.startswith(prefix) for prefix in ["A.", "B."]):
        rating = 1 if pred.split(".")[0] == answer[1] else 0
    elif any(pred.startswith(prefix) for prefix in ["A)", "B)"]):
        rating = 1 if pred.split(")")[0] == answer[1] else 0
    elif any(pred.startswith(prefix) for prefix in ["(A)", "(B)"]):
        # Compare the letter inside the parentheses, e.g. "(A) ..." -> "A"
        rating = 1 if pred[1] == answer[1] else 0
    else:
        # Failed to match an answer in the video-LLM response; use ChatGPT to evaluate.
        match_success = False

        base_prompt = """You will receive a caption matching question, the ground-truth answer and the prediction from a question answering (QA) model. Your task is to determine whether the QA model prediction is correct, based on the question and ground-truth answer. If the prediction is correct, respond "Correct". If the prediction is incorrect, respond "Incorrect". """
        chatgpt_prompt = f"""{base_prompt}\n\nCaption Matching Question: {question}\n\nGround-Truth Answer: {answer}\n\nModel Prediction: {pred}"""
        chatgpt_response, rating = get_eval_result(chatgpt_prompt)

    # Build the result dict once; the ChatGPT fields are only present when
    # rule-based matching failed and the fallback judge was used.
    result_dict = {
        "src_dataset": doc["src_dataset"],
        "video_id": doc["video_name"],
        "question": question,
        "gt-answer": answer,
        "video-llm-prediction": pred,
        "match_success": match_success,
        "rating": rating,
        "aspect": doc["aspect"],
    }
    if not match_success:
        result_dict["chatgpt_prompt"] = chatgpt_prompt
        result_dict["chatgpt_response"] = chatgpt_response
    return {"accuracy": result_dict}
```
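A hypothetical smoke test for the rule-based path, assuming the definitions above are in scope. The doc fields mirror the schema used by the task, but every value below is invented; echoing the gold answer back as the "prediction" always hits the first matching rule:

```python
doc = {
    "src_dataset": "MSRVTT",      # made-up source tag
    "video_name": "video0.mp4",   # made-up file name
    "caption": "A dog runs across the yard.",
    "counterfactual": "A dog sleeps in the yard.",
    "aspect": "Direction",        # made-up aspect label
}
_, gold, _ = format_question_and_answer(doc)
res = vitatecs_process_results(doc, [gold])  # feed the gold answer back as the prediction
assert res["accuracy"]["rating"] == 1 and res["accuracy"]["match_success"]
```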

```python
# Utility function for GPT evaluation when rule-based matching is unsuccessful.
def get_eval_result(prompt, maxtry=10, sys_prompt=None):
    llm_output = None
    while True:
        try:
            llm_output = get_llm_output(prompt, sys_prompt)
            rating = llm_output_to_rating(llm_output)
            return llm_output, rating
        except Exception:
            # Retry on request/parsing failures until maxtry is exhausted.
            if maxtry <= 0:
                return llm_output, 0
            maxtry -= 1
            print(f"GPT evaluation failed, {maxtry} retries remaining...")
            time.sleep(random.uniform(1, 2))


# Utility function for GPT evaluation.
def get_llm_output(prompt, sys_prompt, max_tokens=128):
    if sys_prompt is None:
        sys_prompt = "You are an AI assistant for question answering."
    data = {
        "max_tokens": max_tokens,
        "model": "gpt-3.5-turbo-1106",
        "temperature": 1.0,
        "top_p": 1,
        "presence_penalty": 1,
        "messages": [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": prompt},
        ],
    }
    response = requests.post(API_URL, headers=headers, data=json.dumps(data).encode("utf-8"))
    result = response.content.decode("utf-8")
    dict_result = json.loads(result)
    llm_output = dict_result["choices"][0]["message"]["content"].strip()
    return llm_output
```
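`get_llm_output` posts to the endpoint with `requests` and parses the response body by hand rather than going through the imported `openai` client (those imports go unused in this file). An offline sanity check of just the parsing expression, using a canned body shaped like a Chat Completions response (values invented):

```python
import json

canned = '{"choices": [{"message": {"role": "assistant", "content": " Correct "}}]}'
dict_result = json.loads(canned)
assert dict_result["choices"][0]["message"]["content"].strip() == "Correct"
```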

```python
# Utility function that converts the GPT evaluation into a rating.
def llm_output_to_rating(llm_output):
    # The assert guarantees that one of the branches below assigns a rating.
    assert "Correct" in llm_output or "Incorrect" in llm_output
    if llm_output.startswith("Correct"):
        rating = 1
    elif llm_output.startswith("Incorrect"):
        rating = 0
    elif ("Correct" in llm_output) and ("Incorrect" not in llm_output):
        rating = 1
    elif "Incorrect" in llm_output:
        rating = 0
    return rating
```
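A few offline sanity checks for the parser above. They also show why the substring checks are safe: "Correct" (capital C) is never a substring of "Incorrect":

```python
assert llm_output_to_rating("Correct") == 1
assert llm_output_to_rating("Incorrect") == 0
assert llm_output_to_rating("The prediction is Correct.") == 1
assert llm_output_to_rating("The prediction is Incorrect.") == 0
```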

```python
# Aggregate the per-example ratings into an accuracy percentage.
def vitatecs_aggregate_rating(results, args):
    yes_count = 0

    # results is a list of dicts
    for answer_dict in results:
        if answer_dict["rating"] == 1:
            yes_count += 1

    accuracy = yes_count / len(results)

    return accuracy * 100
```
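A quick check of the aggregation with invented ratings (`args` is unused by the function, so `None` suffices):

```python
fake_results = [{"rating": r} for r in (1, 0, 1, 1)]
assert vitatecs_aggregate_rating(fake_results, None) == 75.0
```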