Merge pull request #95 from AtsuMiyai/new_task/upd · EvolvingLMMs-Lab/lmms-eval@8ee7848 (original) (raw)

17 files changed

lines changed

Original file line number Diff line number Diff line change
@@ -197,6 +197,19 @@ We also provide the raw data exported from Weights & Biases for the detailed res
197 197 - MMMU (mmmu)
198 198 - MMMU Validation (mmmu_val)
199 199 - MMMU Test (mmmu_test)
200 +- MMUPD (mmupd)
201 +- MMUPD Base (mmupd_base)
202 +- MMAAD Base (mmaad_base)
203 +- MMIASD Base (mmiasd_base)
204 +- MMIVQD Base (mmivqd_base)
205 +- MMUPD Option (mmupd_option)
206 +- MMAAD Option (mmaad_option)
207 +- MMIASD Option (mmiasd_option)
208 +- MMIVQD Option (mmivqd_option)
209 +- MMUPD Instruction (mmupd_instruction)
210 +- MMAAD Instruction (mmaad_instruction)
211 +- MMIASD Instruction (mmiasd_instruction)
212 +- MMIVQD Instruction (mmivqd_instruction)
200 213 - MMVet (mmvet)
201 214 - Multi-DocVQA (multidocvqa)
202 215 - Multi-DocVQA Validation (multidocvqa_val)
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Shared template for all MM-UPD tasks (included by each task yaml).
# Fix: the original hunk declared `doc_to_target: "answer"` twice (lines 2 and 5
# of the new file) — duplicate keys are invalid YAML 1.2 and most parsers
# silently keep the last value; the duplicate is removed here.
dataset_path: MM-UPD/MM-UPD
doc_to_target: "answer"
doc_to_visual: !function utils.mmupd_doc_to_visual
doc_to_text: !function utils.mmupd_doc_to_text
process_results: !function utils.mmupd_process_results
model_specific_generation_kwargs:
  llava:
    image_aspect_ratio: original
output_type: generate_until
generation_kwargs:
  until:
    - "ASSISTANT:"
  max_new_tokens: 1024
  temperature: 0
  top_p: 0
  num_beams: 1
  do_sample: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# MM-AAD base setting: question is asked with no extra instruction appended.
task: "mmaad_base"
test_split: test
dataset_name: mmaad_base
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\n"
include: _default_template_mmupd_yaml
metric_list:
  - metric: gpt_eval_score
    # aggregation/higher_is_better belong to the metric item above, not the
    # top-level mapping — the flattened indentation would detach them.
    aggregation: !function utils.mmaad_base
    higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# MM-AAD instruction setting: adds an explicit "None of the above" escape hatch.
task: "mmaad_instruction"
test_split: test
# NOTE(review): dataset_name points at the *_base dataset — the instruction
# variant appears to reuse the base split with a different prompt; confirm intended.
dataset_name: mmaad_base
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nIf all the options are incorrect, answer \"F. None of the above\"."
include: _default_template_mmupd_yaml
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.mmaad_instruction
    higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# MM-AAD option setting: model must answer with the option letter only.
task: "mmaad_option"
test_split: test
dataset_name: mmaad_option
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer with the option's letter from the given choices directly."
include: _default_template_mmupd_yaml
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.mmaad_option
    higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# MM-IASD base setting: question is asked with no extra instruction appended.
task: "mmiasd_base"
test_split: test
dataset_name: mmiasd_base
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\n"
include: _default_template_mmupd_yaml
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.mmiasd_base
    higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# MM-IASD instruction setting: adds an explicit "None of the above" escape hatch.
task: "mmiasd_instruction"
test_split: test
# NOTE(review): dataset_name points at the *_base dataset — the instruction
# variant appears to reuse the base split with a different prompt; confirm intended.
dataset_name: mmiasd_base
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nIf all the options are incorrect, answer \"F. None of the above\"."
include: _default_template_mmupd_yaml
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.mmiasd_instruction
    higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# MM-IASD option setting: model must answer with the option letter only.
task: "mmiasd_option"
test_split: test
dataset_name: mmiasd_option
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer with the option's letter from the given choices directly."
include: _default_template_mmupd_yaml
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.mmiasd_option
    higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# MM-IVQD base setting: question is asked with no extra instruction appended.
task: "mmivqd_base"
test_split: test
dataset_name: mmivqd_base
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\n"
include: _default_template_mmupd_yaml
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.mmivqd_base
    higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# MM-IVQD instruction setting: adds an explicit "image is irrelevant" escape hatch.
task: "mmivqd_instruction"
test_split: test
# NOTE(review): dataset_name points at the *_base dataset — the instruction
# variant appears to reuse the base split with a different prompt; confirm intended.
dataset_name: mmivqd_base
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nIf the given image is irrelevant to the question, answer \"F. The image and question are irrelevant.\"."
include: _default_template_mmupd_yaml
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.mmivqd_instruction
    higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# MM-IVQD option setting: model must answer with the option letter only.
task: "mmivqd_option"
test_split: test
dataset_name: mmivqd_option
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer with the option's letter from the given choices directly."
include: _default_template_mmupd_yaml
metric_list:
  - metric: gpt_eval_score
    aggregation: !function utils.mmivqd_option
    higher_is_better: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Umbrella group running all nine MM-UPD sub-tasks.
group: mmupd
task:
  - mmaad_base
  - mmaad_option
  - mmaad_instruction
  - mmiasd_base
  - mmiasd_option
  - mmiasd_instruction
  - mmivqd_base
  - mmivqd_option
  - mmivqd_instruction
metadata:
  # These three keys belong under `metadata` — at the flattened top level the
  # `metadata` mapping would be null and the settings would be ignored.
  version: 0.0
  sys_prompt: ""
  gpt_eval_model_name: "gpt-3.5-turbo-0613"
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Group running only the three base-setting MM-UPD sub-tasks.
group: mmupd_base
task:
  - mmaad_base
  - mmiasd_base
  - mmivqd_base
metadata:
  # These three keys belong under `metadata` — at the flattened top level the
  # `metadata` mapping would be null and the settings would be ignored.
  version: 0.0
  sys_prompt: ""
  gpt_eval_model_name: "gpt-3.5-turbo-0613"