Fixing scoring logic · EvolvingLMMs-Lab/lmms-eval@8d963e1

@@ -1,5 +1,5 @@
 import json
-
+import re
 import os
 import requests
 import numpy as np

@@ -78,14 +78,15 @@ def get_chat_response(base64_image, prompt, max_retries=5, wait_time=10):
     payload = {
         "model": GPT_EVAL_MODEL_NAME,
         "messages": [
-            {"role": "system", "content": system_prompt},
+            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
             {
                 "role": "user",
                 "content": [
                     {"type": "text", "text": prompt},
-                    {
-                        "type": "image_url",
-                        "image_url": f"data:image/jpeg;base64,{base64_image}",
+                    {"type": "image_url",
+                     "image_url" : {
+                        "url" : f"data:image/jpeg;base64, {base64_image}"
+                    }
                 },
             ],
         }
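For context on the payload shape above: in the OpenAI-compatible chat/completions format, an image content part is an object whose `image_url` field is itself an object with a `url` key holding the data URL, rather than the bare data-URL string. A minimal sketch of that message shape, with `GPT_EVAL_MODEL_NAME` and the prompts passed in as parameters for self-containment (the helper name and default model string are illustrative, not the repository's code):

```python
# Illustrative sketch of the corrected message structure; not the repo's exact helper.
def build_payload(base64_image: str, system_prompt: str, user_prompt: str, model: str = "gpt-4o") -> dict:
    return {
        "model": model,
        "messages": [
            # System text wrapped in a list of typed content parts.
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_prompt},
                    # The image part nests the data URL under "url" instead of
                    # assigning the string directly to "image_url".
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
                ],
            },
        ],
    }
```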

@@ -101,16 +102,16 @@ def get_chat_response(base64_image, prompt, max_retries=5, wait_time=10):
             response_data = response.json()
             return response_data["choices"][0]["message"]["content"], GPT_EVAL_MODEL_NAME
         except requests.exceptions.RequestException as e:
-            eval_logger.warning(f"Request failed on attempt {attempt+1}: {e}")
-            time.sleep(wait_time)
+            print(f"Request failed on attempt {attempt+1}: {e}")
             if attempt == max_retries - 1:
-                eval_logger.error(f"Failed to get response after {max_retries} attempts")
+                print(f"Failed to get response after {max_retries} attempts")
                 return "", GPT_EVAL_MODEL_NAME
         except Exception as e:
-            eval_logger.error(f"Error on attempt {attempt+1}: {e}")
+            print(f"Error on attempt {attempt+1}: {e}")
             return "", GPT_EVAL_MODEL_NAME
 
 
+
 def image_to_base64(pil_image):
     buffered = BytesIO()
     pil_image.save(buffered, format="PNG")
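These handlers sit inside a bounded retry loop around the `requests.post` call, per the `max_retries` and `wait_time` parameters in the hunk header. A self-contained sketch of that pattern, with the URL and headers passed in and an empty string returned on failure as above; the pause between attempts is optional (this commit drops the `time.sleep` call):

```python
import time

import requests


def post_with_retries(url: str, headers: dict, payload: dict, max_retries: int = 5, wait_time: int = 10) -> str:
    """Illustrative sketch of the retry pattern implied above; not the repo's exact get_chat_response."""
    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            return response.json()["choices"][0]["message"]["content"]
        except requests.exceptions.RequestException as e:
            print(f"Request failed on attempt {attempt+1}: {e}")
            if attempt == max_retries - 1:
                return ""
            time.sleep(wait_time)  # optional pause before the next attempt
        except Exception as e:
            print(f"Error on attempt {attempt+1}: {e}")
            return ""
    return ""
```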

@@ -149,13 +150,29 @@ def wild_vision_process_results(doc, results):
     user_prompt = prompt_template.format(question_1=doc["instruction"], answer_1=doc[BASELINE_MODEL_NAME], answer_2=pred)
     base64_image = image_to_base64(doc["image"])
     resps, gpt_name = get_chat_response(base64_image, user_prompt)
-    score, _ = get_score(resps, pattern="\[\[([AB<>=]+)\]\]")
-    try:
-        score = int(score)
-    except:
-        score = 0
+    score, _ = get_score(resps, pattern=re.compile("\[\[([AB<>=]+)\]\]"))
+
+    if "A>B" in score:
+        final_score = -1
+        judgement = "Worse" #Baseline better
+    elif "A>>B" in score:
+        final_score = -2
+        judgement = "Worse++"
+    elif "A=B" in score:
+        final_score = 0
+        judgement = "Tie"
+    elif "B>A" in score:
+        final_score = 1
+        judgement = "Better"
+    elif "B>>A" in score:
+        final_score = 2
+        judgement = "Better++"
+    else:
+        final_score = 0
+        judgement = "Unclear"
+
 
-    return {"gpt_eval_score" : {"question" : doc["instruction"], "score" : score, "gpt_resps" : resps, "ans_1" : doc[BASELINE_MODEL_NAME], "ans_2" : pred}}
+    return {"gpt_eval_score" : {"question" : doc["instruction"], "score" : final_score, "gpt_resps" : resps, "ans_1" : doc[BASELINE_MODEL_NAME], "ans_2" : pred, "filtered_resps" : score, "judgement" : judgement}}
 
 
 def wild_vision_aggregation(results):
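The new block maps the judge's bracketed verdict (e.g. `[[A>B]]`, `[[B>>A]]`, where A is the baseline answer) onto an integer score from -2 to 2. The previous code applied `int(score)` to the extracted verdict, but the `[AB<>=]+` pattern never captures digits, so the conversion always failed and every example collapsed to a score of 0. A compact sketch of an equivalent parse-and-map step; the `parse_judgement` helper and the exact-match table are illustrative, not the repository's `get_score`:

```python
import re

# Verdict tag produced by the pairwise judge prompt; "A" is the baseline answer.
VERDICT_PATTERN = re.compile(r"\[\[([AB<>=]+)\]\]")

# An exact-match table sidesteps any substring ambiguity between "A>B" and "A>>B".
VERDICT_TO_SCORE = {
    "A>>B": (-2, "Worse++"),
    "A>B": (-1, "Worse"),   # baseline better
    "A=B": (0, "Tie"),
    "B>A": (1, "Better"),
    "B>>A": (2, "Better++"),
}


def parse_judgement(judge_response: str) -> tuple:
    """Illustrative helper: extract the [[...]] verdict and map it to (score, judgement)."""
    match = VERDICT_PATTERN.search(judge_response)
    verdict = match.group(1) if match else ""
    return VERDICT_TO_SCORE.get(verdict, (0, "Unclear"))


# Example: parse_judgement("Final verdict: [[B>>A]]") -> (2, "Better++")
```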