Fixing scoring logic · EvolvingLMMs-Lab/lmms-eval@8d963e1
```diff
@@ -1,5 +1,5 @@
 import json
-
+import re
 import os
 import requests
 import numpy as np
```
```diff
@@ -78,14 +78,15 @@ def get_chat_response(base64_image, prompt, max_retries=5, wait_time=10):
     payload = {
         "model": GPT_EVAL_MODEL_NAME,
         "messages": [
-            {"role": "system", "content": system_prompt},
+            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
             {
                 "role": "user",
                 "content": [
                     {"type": "text", "text": prompt},
-                    {
-                        "type": "image_url",
-                        "image_url": f"data:image/jpeg;base64,{base64_image}",
+                    {"type": "image_url",
+                     "image_url" : {
+                         "url" : f"data:image/jpeg;base64, {base64_image}"
+                     }
                     },
                 ],
             }
```
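For reference, the corrected user message wraps the base64 data URI in an `image_url` object with a `url` field, which is the shape OpenAI-style chat completions endpoints expect for inline images. A minimal sketch of assembling such a payload outside the diff (the function name, default model name, and PNG encoding are placeholder choices, not taken from this commit):

```python
import base64
from io import BytesIO

from PIL import Image


def build_vision_payload(pil_image: Image.Image, prompt: str, system_prompt: str, model: str = "gpt-4o"):
    """Sketch: build an OpenAI-style chat payload with an inline base64 image."""
    buffered = BytesIO()
    pil_image.save(buffered, format="PNG")
    base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return {
        "model": model,  # the diff uses GPT_EVAL_MODEL_NAME here
        "messages": [
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    # the image part must be an object with a "url" key,
                    # not a bare data-URI string
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}},
                ],
            },
        ],
    }
```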
```diff
@@ -101,16 +102,16 @@ def get_chat_response(base64_image, prompt, max_retries=5, wait_time=10):
             response_data = response.json()
             return response_data["choices"][0]["message"]["content"], GPT_EVAL_MODEL_NAME
         except requests.exceptions.RequestException as e:
-            eval_logger.warning(f"Request failed on attempt {attempt+1}: {e}")
-            time.sleep(wait_time)
+            print(f"Request failed on attempt {attempt+1}: {e}")
             if attempt == max_retries - 1:
-                eval_logger.error(f"Failed to get response after {max_retries} attempts")
+                print(f"Failed to get response after {max_retries} attempts")
                 return "", GPT_EVAL_MODEL_NAME
         except Exception as e:
-            eval_logger.error(f"Error on attempt {attempt+1}: {e}")
+            print(f"Error on attempt {attempt+1}: {e}")
             return "", GPT_EVAL_MODEL_NAME
 
 
+
 def image_to_base64(pil_image):
     buffered = BytesIO()
     pil_image.save(buffered, format="PNG")
```
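The hunk above shows only the `except` branches of the request logic, which now report failures with `print` and no longer sleep between attempts. A hypothetical reconstruction of the surrounding retry loop is sketched below; the `requests.post` call, URL, headers, and timeout are assumptions, since they are not part of this diff.

```python
import requests


def post_with_retries(api_url: str, headers: dict, payload: dict, max_retries: int = 5) -> str:
    """Sketch of the retry structure implied by the except branches above (names assumed)."""
    for attempt in range(max_retries):
        try:
            response = requests.post(api_url, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            response_data = response.json()
            return response_data["choices"][0]["message"]["content"]
        except requests.exceptions.RequestException as e:
            print(f"Request failed on attempt {attempt+1}: {e}")
            if attempt == max_retries - 1:
                print(f"Failed to get response after {max_retries} attempts")
                return ""
        except Exception as e:
            print(f"Error on attempt {attempt+1}: {e}")
            return ""
    return ""
```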
```diff
@@ -149,13 +150,29 @@ def wild_vision_process_results(doc, results):
     user_prompt = prompt_template.format(question_1=doc["instruction"], answer_1=doc[BASELINE_MODEL_NAME], answer_2=pred)
     base64_image = image_to_base64(doc["image"])
     resps, gpt_name = get_chat_response(base64_image, user_prompt)
-    score, _ = get_score(resps, pattern="\[\[([AB<>=]+)\]\]")
-    try:
-        score = int(score)
-    except:
-        score = 0
+    score, _ = get_score(resps, pattern=re.compile("\[\[([AB<>=]+)\]\]"))
+
+    if "A>B" in score:
+        final_score = -1
+        judgement = "Worse" #Baseline better
+    elif "A>>B" in score:
+        final_score = -2
+        judgement = "Worse++"
+    elif "A=B" in score:
+        final_score = 0
+        judgement = "Tie"
+    elif "B>A" in score:
+        final_score = 1
+        judgement = "Better"
+    elif "B>>A" in score:
+        final_score = 2
+        judgement = "Better++"
+    else:
+        final_score = 0
+        judgement = "Unclear"
+
 
-    return {"gpt_eval_score" : {"question" : doc["instruction"], "score" : score, "gpt_resps" : resps, "ans_1" : doc[BASELINE_MODEL_NAME], "ans_2" : pred}}
+    return {"gpt_eval_score" : {"question" : doc["instruction"], "score" : final_score, "gpt_resps" : resps, "ans_1" : doc[BASELINE_MODEL_NAME], "ans_2" : pred, "filtered_resps" : score, "judgement" : judgement}}
 
 
 def wild_vision_aggregation(results):
```
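Since `get_score` is defined elsewhere in the file and not shown in this diff, the standalone sketch below illustrates how a judge verdict such as `[[A>B]]` is extracted and mapped onto the new score scale, where answer A is the baseline model's response (`answer_1=doc[BASELINE_MODEL_NAME]`) and answer B is the evaluated model's prediction (`answer_2=pred`). The helper name and the exact-match lookup are illustrative, not the repository's code.

```python
import re

# Pattern for verdict tags like [[A>B]] or [[B>>A]] in the judge's response.
VERDICT_PATTERN = re.compile(r"\[\[([AB<>=]+)\]\]")

# Score mapping mirroring the if/elif chain in the diff:
# negative -> baseline (A) preferred, positive -> candidate (B) preferred.
JUDGEMENTS = {
    "A>>B": (-2, "Worse++"),
    "A>B": (-1, "Worse"),
    "A=B": (0, "Tie"),
    "B>A": (1, "Better"),
    "B>>A": (2, "Better++"),
}


def parse_verdict(judge_response: str):
    """Return (verdict string, numeric score, judgement label) for a judge response."""
    match = VERDICT_PATTERN.search(judge_response)
    verdict = match.group(1) if match else ""
    final_score, judgement = JUDGEMENTS.get(verdict, (0, "Unclear"))
    return verdict, final_score, judgement


# Example: parse_verdict("My final verdict is tie: [[A=B]]") -> ("A=B", 0, "Tie")
```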
`