add upd · EvolvingLMMs-Lab/lmms-eval@71401ba (original) (raw)

File tree

25 files changed

Original file line number Diff line number Diff line change
@@ -190,6 +190,19 @@ We also provide the raw data exported from Weights & Biases for the detailed res
190 190 - MMMU (mmmu)
191 191 - MMMU Validation (mmmu_val)
192 192 - MMMU Test (mmmu_test)
193 +- MMUPDBench (mmupdbench)
194 +- MMUPDBench Base (mmupdbench_base)
195 +- MMAADBench Base (mmaadbench_base)
196 +- MMIASDBench Base (mmiasdbench_base)
197 +- MMIVQDBench Base (mmivqdbench_base)
198 +- MMUPDBench Option (mmupdbench_option)
199 +- MMAADBench Option (mmaadbench_option)
200 +- MMIASDBench Option (mmiasdbench_option)
201 +- MMIVQDBench Option (mmivqdbench_option)
202 +- MMUPDBench Instruction (mmupdbench_instruction)
203 +- MMAADBench Instruction (mmaadbench_instruction)
204 +- MMIASDBench Instruction (mmiasdbench_instruction)
205 +- MMIVQDBench Instruction (mmivqdbench_instruction)
193 206 - MMVet (mmvet)
194 207 - Multi-DocVQA (multidocvqa)
195 208 - Multi-DocVQA Validation (multidocvqa_val)
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
1 +dataset_path: MM-UPD/MM-UPD
2 +doc_to_target: "answer"
3 +model_specific_prompt_kwargs:
4 + default:
5 + pre_prompt: ""
6 + post_prompt: "\n"
7 +doc_to_visual: !function utils.mmupdbench_doc_to_visual
8 +doc_to_text: !function utils.mmupdbench_doc_to_text
9 +doc_to_target: "answer"
10 +process_results: !function utils.mmupdbench_process_results
11 +model_specific_generation_kwargs:
12 + llava:
13 + image_aspect_ratio: original
14 +output_type: generate_until
15 +dataset_name: mmaad_base
16 +generation_kwargs:
17 + until:
18 + - "ASSISTANT:"
19 + max_new_tokens: 1024
20 + temperature: 0
21 + top_p: 0
22 + num_beams: 1
23 + do_sample: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
1 +dataset_path: MM-UPD/MM-UPD
2 +doc_to_target: "answer"
3 +model_specific_prompt_kwargs:
4 + default:
5 + pre_prompt: ""
6 + post_prompt: "\nIf all the options are incorrect, answer \"F. None of the above\"."
7 +doc_to_visual: !function utils.mmupdbench_doc_to_visual
8 +doc_to_text: !function utils.mmupdbench_doc_to_text
9 +doc_to_target: "answer"
10 +process_results: !function utils.mmupdbench_process_results
11 +model_specific_generation_kwargs:
12 + llava:
13 + image_aspect_ratio: original
14 +output_type: generate_until
15 +dataset_name: mmaad_base
16 +generation_kwargs:
17 + until:
18 + - "ASSISTANT:"
19 + max_new_tokens: 1024
20 + temperature: 0
21 + top_p: 0
22 + num_beams: 1
23 + do_sample: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
1 +dataset_path: MM-UPD/MM-UPD
2 +doc_to_target: "answer"
3 +model_specific_prompt_kwargs:
4 + default:
5 + pre_prompt: ""
6 + post_prompt: "\nAnswer with the option's letter from the given choices directly."
7 +doc_to_visual: !function utils.mmupdbench_doc_to_visual
8 +doc_to_text: !function utils.mmupdbench_doc_to_text
9 +doc_to_target: "answer"
10 +process_results: !function utils.mmupdbench_process_results
11 +model_specific_generation_kwargs:
12 + llava:
13 + image_aspect_ratio: original
14 +output_type: generate_until
15 +dataset_name: mmaad_option
16 +generation_kwargs:
17 + until:
18 + - "ASSISTANT:"
19 + max_new_tokens: 1024
20 + temperature: 0
21 + top_p: 0
22 + num_beams: 1
23 + do_sample: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
1 +dataset_path: MM-UPD/MM-UPD
2 +doc_to_target: "answer"
3 +model_specific_prompt_kwargs:
4 + default:
5 + pre_prompt: ""
6 + post_prompt: "\n"
7 +doc_to_visual: !function utils.mmupdbench_doc_to_visual
8 +doc_to_text: !function utils.mmupdbench_doc_to_text
9 +doc_to_target: "answer"
10 +process_results: !function utils.mmupdbench_process_results
11 +model_specific_generation_kwargs:
12 + llava:
13 + image_aspect_ratio: original
14 +output_type: generate_until
15 +dataset_name: mmiasd_base
16 +generation_kwargs:
17 + until:
18 + - "ASSISTANT:"
19 + max_new_tokens: 1024
20 + temperature: 0
21 + top_p: 0
22 + num_beams: 1
23 + do_sample: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
1 +dataset_path: MM-UPD/MM-UPD
2 +doc_to_target: "answer"
3 +model_specific_prompt_kwargs:
4 + default:
5 + pre_prompt: ""
6 + post_prompt: "\nIf all the options are incorrect, answer \"F. None of the above\"."
7 +doc_to_visual: !function utils.mmupdbench_doc_to_visual
8 +doc_to_text: !function utils.mmupdbench_doc_to_text
9 +doc_to_target: "answer"
10 +process_results: !function utils.mmupdbench_process_results
11 +model_specific_generation_kwargs:
12 + llava:
13 + image_aspect_ratio: original
14 +output_type: generate_until
15 +dataset_name: mmiasd_base
16 +generation_kwargs:
17 + until:
18 + - "ASSISTANT:"
19 + max_new_tokens: 1024
20 + temperature: 0
21 + top_p: 0
22 + num_beams: 1
23 + do_sample: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
1 +dataset_path: MM-UPD/MM-UPD
2 +doc_to_target: "answer"
3 +model_specific_prompt_kwargs:
4 + default:
5 + pre_prompt: ""
6 + post_prompt: "\nAnswer with the option's letter from the given choices directly."
7 +doc_to_visual: !function utils.mmupdbench_doc_to_visual
8 +doc_to_text: !function utils.mmupdbench_doc_to_text
9 +doc_to_target: "answer"
10 +process_results: !function utils.mmupdbench_process_results
11 +model_specific_generation_kwargs:
12 + llava:
13 + image_aspect_ratio: original
14 +output_type: generate_until
15 +dataset_name: mmiasd_option
16 +generation_kwargs:
17 + until:
18 + - "ASSISTANT:"
19 + max_new_tokens: 1024
20 + temperature: 0
21 + top_p: 0
22 + num_beams: 1
23 + do_sample: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
1 +dataset_path: MM-UPD/MM-UPD
2 +doc_to_target: "answer"
3 +model_specific_prompt_kwargs:
4 + default:
5 + pre_prompt: ""
6 + post_prompt: "\n"
7 +doc_to_visual: !function utils.mmupdbench_doc_to_visual
8 +doc_to_text: !function utils.mmupdbench_doc_to_text
9 +doc_to_target: "answer"
10 +process_results: !function utils.mmupdbench_process_results
11 +model_specific_generation_kwargs:
12 + llava:
13 + image_aspect_ratio: original
14 +output_type: generate_until
15 +dataset_name: mmivqd_base
16 +generation_kwargs:
17 + until:
18 + - "ASSISTANT:"
19 + max_new_tokens: 1024
20 + temperature: 0
21 + top_p: 0
22 + num_beams: 1
23 + do_sample: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
1 +dataset_path: MM-UPD/MM-UPD
2 +doc_to_target: "answer"
3 +model_specific_prompt_kwargs:
4 + default:
5 + pre_prompt: ""
6 + post_prompt: "\nIf the given image is irrelevant to the question, answer \"F. The image and question are irrelevant.\"."
7 +doc_to_visual: !function utils.mmupdbench_doc_to_visual
8 +doc_to_text: !function utils.mmupdbench_doc_to_text
9 +doc_to_target: "answer"
10 +process_results: !function utils.mmupdbench_process_results
11 +model_specific_generation_kwargs:
12 + llava:
13 + image_aspect_ratio: original
14 +output_type: generate_until
15 +dataset_name: mmivqd_base
16 +generation_kwargs:
17 + until:
18 + - "ASSISTANT:"
19 + max_new_tokens: 1024
20 + temperature: 0
21 + top_p: 0
22 + num_beams: 1
23 + do_sample: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
1 +dataset_path: MM-UPD/MM-UPD
2 +doc_to_target: "answer"
3 +model_specific_prompt_kwargs:
4 + default:
5 + pre_prompt: ""
6 + post_prompt: "\nAnswer with the option's letter from the given choices directly."
7 +doc_to_visual: !function utils.mmupdbench_doc_to_visual
8 +doc_to_text: !function utils.mmupdbench_doc_to_text
9 +doc_to_target: "answer"
10 +process_results: !function utils.mmupdbench_process_results
11 +model_specific_generation_kwargs:
12 + llava:
13 + image_aspect_ratio: original
14 +output_type: generate_until
15 +dataset_name: mmivqd_option
16 +generation_kwargs:
17 + until:
18 + - "ASSISTANT:"
19 + max_new_tokens: 1024
20 + temperature: 0
21 + top_p: 0
22 + num_beams: 1
23 + do_sample: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
1 +task: "mmaadbench_base"
2 +test_split: test
3 +include: _default_template_mmaadbench_base_yaml
4 +metric_list:
5 + - metric: gpt_eval_score
6 +aggregation: !function utils.mmaadbench_base
7 +higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
1 +task: "mmaadbench_instruction"
2 +test_split: test
3 +include: _default_template_mmaadbench_instruction_yaml
4 +metric_list:
5 + - metric: gpt_eval_score
6 +aggregation: !function utils.mmaadbench_instruction
7 +higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
1 +task: "mmaadbench_option"
2 +test_split: test
3 +include: _default_template_mmaadbench_option_yaml
4 +metric_list:
5 + - metric: gpt_eval_score
6 +aggregation: !function utils.mmaadbench_option
7 +higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
1 +task: "mmiasdbench_base"
2 +test_split: test
3 +include: _default_template_mmiasdbench_base_yaml
4 +metric_list:
5 + - metric: gpt_eval_score
6 +aggregation: !function utils.mmiasdbench_base
7 +higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
1 +task: "mmiasdbench_instruction"
2 +test_split: test
3 +include: _default_template_mmiasdbench_instruction_yaml
4 +metric_list:
5 + - metric: gpt_eval_score
6 +aggregation: !function utils.mmiasdbench_instruction
7 +higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
1 +task: "mmiasdbench_option"
2 +test_split: test
3 +include: _default_template_mmiasdbench_option_yaml
4 +metric_list:
5 + - metric: gpt_eval_score
6 +aggregation: !function utils.mmiasdbench_option
7 +higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
1 +task: "mmivqdbench_base"
2 +test_split: test
3 +include: _default_template_mmivqdbench_base_yaml
4 +metric_list:
5 + - metric: gpt_eval_score
6 +aggregation: !function utils.mmivqdbench_base
7 +higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
1 +task: "mmivqdbench_instruction"
2 +test_split: test
3 +include: _default_template_mmivqdbench_instruction_yaml
4 +metric_list:
5 + - metric: gpt_eval_score
6 +aggregation: !function utils.mmivqdbench_instruction
7 +higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
1 +task: "mmivqdbench_option"
2 +test_split: test
3 +include: _default_template_mmivqdbench_option_yaml
4 +metric_list:
5 + - metric: gpt_eval_score
6 +aggregation: !function utils.mmivqdbench_option
7 +higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
1 +group: mmupdbench
2 +task:
3 + - mmaadbench_base
4 + - mmaadbench_option
5 + - mmaadbench_instruction
6 + - mmiasdbench_base
7 + - mmiasdbench_option
8 + - mmiasdbench_instruction
9 + - mmivqdbench_base
10 + - mmivqdbench_option
11 + - mmivqdbench_instruction
12 +metadata:
13 +version: 0.0
14 +sys_prompt: ""
15 +gpt_eval_model_name: "gpt-3.5-turbo-0613"
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
1 +group: mmupdbench_base
2 +task:
3 + - mmaadbench_base
4 + - mmiasdbench_base
5 + - mmivqdbench_base
6 +metadata:
7 +version: 0.0
8 +sys_prompt: ""
9 +gpt_eval_model_name: "gpt-3.5-turbo-0613"
10 +