add upd · EvolvingLMMs-Lab/lmms-eval@71401ba (original) (raw)

File tree

25 files changed

lines changed

lmms_eval/tasks/mmupdbench

25 files changed

lines changed

Original file line number	Diff line number	Diff line change
@@ -190,6 +190,19 @@ We also provide the raw data exported from Weights & Biases for the detailed res
190	190	- MMMU (mmmu)
191	191	- MMMU Validation (mmmu_val)
192	192	- MMMU Test (mmmu_test)
	193	+- MMUPDBench (mmupdbench)
	194	+- MMUPDBench Base (mmupdbench_base)
	195	+- MMAADBench Base (mmaadbench_base)
	196	+- MMIASDBench Base (mmiasdbench_base)
	197	+- MMIVQDBench Base (mmivqdbench_base)
	198	+- MMUPDBench Option (mmupdbench_option)
	199	+- MMAADBench Option (mmaadbench_option)
	200	+- MMIASDBench Option (mmiasdbench_option)
	201	+- MMIVQDBench Option (mmivqdbench_option)
	202	+- MMUPDBench Instruction (mmupdbench_instruction)
	203	+- MMAADBench Instruction (mmaadbench_instruction)
	204	+- MMIASDBench Instruction (mmiasdbench_instruction)
	205	+- MMIVQDBench Instruction (mmivqdbench_instruction)
193	206	- MMVet (mmvet)
194	207	- Multi-DocVQA (multidocvqa)
195	208	- Multi-DocVQA Validation (multidocvqa_val)

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,23 @@
	1	+dataset_path: MM-UPD/MM-UPD
	2	+doc_to_target: "answer"
	3	+model_specific_prompt_kwargs:
	4	+ default:
	5	+ pre_prompt: ""
	6	+ post_prompt: "\n"
	7	+doc_to_visual: !function utils.mmupdbench_doc_to_visual
	8	+doc_to_text: !function utils.mmupdbench_doc_to_text
	9	+# doc_to_target is already set to "answer" near the top of this file; the duplicate key is dropped here.
	10	+process_results: !function utils.mmupdbench_process_results
	11	+model_specific_generation_kwargs:
	12	+ llava:
	13	+ image_aspect_ratio: original
	14	+output_type: generate_until
	15	+dataset_name: mmaad_base
	16	+generation_kwargs:
	17	+ until:
	18	+ - "ASSISTANT:"
	19	+ max_new_tokens: 1024
	20	+ temperature: 0
	21	+ top_p: 0
	22	+ num_beams: 1
	23	+ do_sample: false

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,23 @@
	1	+dataset_path: MM-UPD/MM-UPD
	2	+doc_to_target: "answer"
	3	+model_specific_prompt_kwargs:
	4	+ default:
	5	+ pre_prompt: ""
	6	+ post_prompt: "\nIf all the options are incorrect, answer \"F. None of the above\"."
	7	+doc_to_visual: !function utils.mmupdbench_doc_to_visual
	8	+doc_to_text: !function utils.mmupdbench_doc_to_text
	9	+# doc_to_target is already set to "answer" near the top of this file; the duplicate key is dropped here.
	10	+process_results: !function utils.mmupdbench_process_results
	11	+model_specific_generation_kwargs:
	12	+ llava:
	13	+ image_aspect_ratio: original
	14	+output_type: generate_until
	15	+dataset_name: mmaad_base
	16	+generation_kwargs:
	17	+ until:
	18	+ - "ASSISTANT:"
	19	+ max_new_tokens: 1024
	20	+ temperature: 0
	21	+ top_p: 0
	22	+ num_beams: 1
	23	+ do_sample: false

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,23 @@
	1	+dataset_path: MM-UPD/MM-UPD
	2	+doc_to_target: "answer"
	3	+model_specific_prompt_kwargs:
	4	+ default:
	5	+ pre_prompt: ""
	6	+ post_prompt: "\nAnswer with the option's letter from the given choices directly."
	7	+doc_to_visual: !function utils.mmupdbench_doc_to_visual
	8	+doc_to_text: !function utils.mmupdbench_doc_to_text
	9	+# doc_to_target is already set to "answer" near the top of this file; the duplicate key is dropped here.
	10	+process_results: !function utils.mmupdbench_process_results
	11	+model_specific_generation_kwargs:
	12	+ llava:
	13	+ image_aspect_ratio: original
	14	+output_type: generate_until
	15	+dataset_name: mmaad_option
	16	+generation_kwargs:
	17	+ until:
	18	+ - "ASSISTANT:"
	19	+ max_new_tokens: 1024
	20	+ temperature: 0
	21	+ top_p: 0
	22	+ num_beams: 1
	23	+ do_sample: false

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,23 @@
	1	+dataset_path: MM-UPD/MM-UPD
	2	+doc_to_target: "answer"
	3	+model_specific_prompt_kwargs:
	4	+ default:
	5	+ pre_prompt: ""
	6	+ post_prompt: "\n"
	7	+doc_to_visual: !function utils.mmupdbench_doc_to_visual
	8	+doc_to_text: !function utils.mmupdbench_doc_to_text
	9	+# doc_to_target is already set to "answer" near the top of this file; the duplicate key is dropped here.
	10	+process_results: !function utils.mmupdbench_process_results
	11	+model_specific_generation_kwargs:
	12	+ llava:
	13	+ image_aspect_ratio: original
	14	+output_type: generate_until
	15	+dataset_name: mmiasd_base
	16	+generation_kwargs:
	17	+ until:
	18	+ - "ASSISTANT:"
	19	+ max_new_tokens: 1024
	20	+ temperature: 0
	21	+ top_p: 0
	22	+ num_beams: 1
	23	+ do_sample: false

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,23 @@
	1	+dataset_path: MM-UPD/MM-UPD
	2	+doc_to_target: "answer"
	3	+model_specific_prompt_kwargs:
	4	+ default:
	5	+ pre_prompt: ""
	6	+ post_prompt: "\nIf all the options are incorrect, answer \"F. None of the above\"."
	7	+doc_to_visual: !function utils.mmupdbench_doc_to_visual
	8	+doc_to_text: !function utils.mmupdbench_doc_to_text
	9	+# doc_to_target is already set to "answer" near the top of this file; the duplicate key is dropped here.
	10	+process_results: !function utils.mmupdbench_process_results
	11	+model_specific_generation_kwargs:
	12	+ llava:
	13	+ image_aspect_ratio: original
	14	+output_type: generate_until
	15	+dataset_name: mmiasd_base
	16	+generation_kwargs:
	17	+ until:
	18	+ - "ASSISTANT:"
	19	+ max_new_tokens: 1024
	20	+ temperature: 0
	21	+ top_p: 0
	22	+ num_beams: 1
	23	+ do_sample: false

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,23 @@
	1	+dataset_path: MM-UPD/MM-UPD
	2	+doc_to_target: "answer"
	3	+model_specific_prompt_kwargs:
	4	+ default:
	5	+ pre_prompt: ""
	6	+ post_prompt: "\nAnswer with the option's letter from the given choices directly."
	7	+doc_to_visual: !function utils.mmupdbench_doc_to_visual
	8	+doc_to_text: !function utils.mmupdbench_doc_to_text
	9	+# doc_to_target is already set to "answer" near the top of this file; the duplicate key is dropped here.
	10	+process_results: !function utils.mmupdbench_process_results
	11	+model_specific_generation_kwargs:
	12	+ llava:
	13	+ image_aspect_ratio: original
	14	+output_type: generate_until
	15	+dataset_name: mmiasd_option
	16	+generation_kwargs:
	17	+ until:
	18	+ - "ASSISTANT:"
	19	+ max_new_tokens: 1024
	20	+ temperature: 0
	21	+ top_p: 0
	22	+ num_beams: 1
	23	+ do_sample: false

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,23 @@
	1	+dataset_path: MM-UPD/MM-UPD
	2	+doc_to_target: "answer"
	3	+model_specific_prompt_kwargs:
	4	+ default:
	5	+ pre_prompt: ""
	6	+ post_prompt: "\n"
	7	+doc_to_visual: !function utils.mmupdbench_doc_to_visual
	8	+doc_to_text: !function utils.mmupdbench_doc_to_text
	9	+# doc_to_target is already set to "answer" near the top of this file; the duplicate key is dropped here.
	10	+process_results: !function utils.mmupdbench_process_results
	11	+model_specific_generation_kwargs:
	12	+ llava:
	13	+ image_aspect_ratio: original
	14	+output_type: generate_until
	15	+dataset_name: mmivqd_base
	16	+generation_kwargs:
	17	+ until:
	18	+ - "ASSISTANT:"
	19	+ max_new_tokens: 1024
	20	+ temperature: 0
	21	+ top_p: 0
	22	+ num_beams: 1
	23	+ do_sample: false

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,23 @@
	1	+dataset_path: MM-UPD/MM-UPD
	2	+doc_to_target: "answer"
	3	+model_specific_prompt_kwargs:
	4	+ default:
	5	+ pre_prompt: ""
	6	+ post_prompt: "\nIf the given image is irrelevant to the question, answer \"F. The image and question are irrelevant.\"."
	7	+doc_to_visual: !function utils.mmupdbench_doc_to_visual
	8	+doc_to_text: !function utils.mmupdbench_doc_to_text
	9	+# doc_to_target is already set to "answer" near the top of this file; the duplicate key is dropped here.
	10	+process_results: !function utils.mmupdbench_process_results
	11	+model_specific_generation_kwargs:
	12	+ llava:
	13	+ image_aspect_ratio: original
	14	+output_type: generate_until
	15	+dataset_name: mmivqd_base
	16	+generation_kwargs:
	17	+ until:
	18	+ - "ASSISTANT:"
	19	+ max_new_tokens: 1024
	20	+ temperature: 0
	21	+ top_p: 0
	22	+ num_beams: 1
	23	+ do_sample: false

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,23 @@
	1	+dataset_path: MM-UPD/MM-UPD
	2	+doc_to_target: "answer"
	3	+model_specific_prompt_kwargs:
	4	+ default:
	5	+ pre_prompt: ""
	6	+ post_prompt: "\nAnswer with the option's letter from the given choices directly."
	7	+doc_to_visual: !function utils.mmupdbench_doc_to_visual
	8	+doc_to_text: !function utils.mmupdbench_doc_to_text
	9	+# doc_to_target is already set to "answer" near the top of this file; the duplicate key is dropped here.
	10	+process_results: !function utils.mmupdbench_process_results
	11	+model_specific_generation_kwargs:
	12	+ llava:
	13	+ image_aspect_ratio: original
	14	+output_type: generate_until
	15	+dataset_name: mmivqd_option
	16	+generation_kwargs:
	17	+ until:
	18	+ - "ASSISTANT:"
	19	+ max_new_tokens: 1024
	20	+ temperature: 0
	21	+ top_p: 0
	22	+ num_beams: 1
	23	+ do_sample: false

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,7 @@
	1	+task: "mmaadbench_base"
	2	+test_split: test
	3	+include: _default_template_mmaadbench_base_yaml
	4	+metric_list:
	5	+ - metric: gpt_eval_score
	6	+aggregation: !function utils.mmaadbench_base
	7	+higher_is_better: true

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,7 @@
	1	+task: "mmaadbench_instruction"
	2	+test_split: test
	3	+include: _default_template_mmaadbench_instruction_yaml
	4	+metric_list:
	5	+ - metric: gpt_eval_score
	6	+aggregation: !function utils.mmaadbench_instruction
	7	+higher_is_better: true

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,7 @@
	1	+task: "mmaadbench_option"
	2	+test_split: test
	3	+include: _default_template_mmaadbench_option_yaml
	4	+metric_list:
	5	+ - metric: gpt_eval_score
	6	+aggregation: !function utils.mmaadbench_option
	7	+higher_is_better: true

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,7 @@
	1	+task: "mmiasdbench_base"
	2	+test_split: test
	3	+include: _default_template_mmiasdbench_base_yaml
	4	+metric_list:
	5	+ - metric: gpt_eval_score
	6	+aggregation: !function utils.mmiasdbench_base
	7	+higher_is_better: true

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,7 @@
	1	+task: "mmiasdbench_instruction"
	2	+test_split: test
	3	+include: _default_template_mmiasdbench_instruction_yaml
	4	+metric_list:
	5	+ - metric: gpt_eval_score
	6	+aggregation: !function utils.mmiasdbench_instruction
	7	+higher_is_better: true

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,7 @@
	1	+task: "mmiasdbench_option"
	2	+test_split: test
	3	+include: _default_template_mmiasdbench_option_yaml
	4	+metric_list:
	5	+ - metric: gpt_eval_score
	6	+aggregation: !function utils.mmiasdbench_option
	7	+higher_is_better: true

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,7 @@
	1	+task: "mmivqdbench_base"
	2	+test_split: test
	3	+include: _default_template_mmivqdbench_base_yaml
	4	+metric_list:
	5	+ - metric: gpt_eval_score
	6	+aggregation: !function utils.mmivqdbench_base
	7	+higher_is_better: true

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,7 @@
	1	+task: "mmivqdbench_instruction"
	2	+test_split: test
	3	+include: _default_template_mmivqdbench_instruction_yaml
	4	+metric_list:
	5	+ - metric: gpt_eval_score
	6	+aggregation: !function utils.mmivqdbench_instruction
	7	+higher_is_better: true

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,7 @@
	1	+task: "mmivqdbench_option"
	2	+test_split: test
	3	+include: _default_template_mmivqdbench_option_yaml
	4	+metric_list:
	5	+ - metric: gpt_eval_score
	6	+aggregation: !function utils.mmivqdbench_option
	7	+higher_is_better: true

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,15 @@
	1	+group: mmupdbench
	2	+task:
	3	+ - mmaadbench_base
	4	+ - mmaadbench_option
	5	+ - mmaadbench_instruction
	6	+ - mmiasdbench_base
	7	+ - mmiasdbench_option
	8	+ - mmiasdbench_instruction
	9	+ - mmivqdbench_base
	10	+ - mmivqdbench_option
	11	+ - mmivqdbench_instruction
	12	+metadata:
	13	+version: 0.0
	14	+sys_prompt: ""
	15	+gpt_eval_model_name: "gpt-3.5-turbo-0613"

Original file line number	Diff line number	Diff line change
@@ -0,0 +1,10 @@
	1	+group: mmupdbench_base
	2	+task:
	3	+ - mmaadbench_base
	4	+ - mmiasdbench_base
	5	+ - mmivqdbench_base
	6	+metadata:
	7	+version: 0.0
	8	+sys_prompt: ""
	9	+gpt_eval_model_name: "gpt-3.5-turbo-0613"
	10	+