Issues with Model Performance on Smaller Roboflow Datasets · Issue #214 · Peterande/D-FINE

Hey @Peterande! First of all, thank you so much for this interesting contribution.

We at Roboflow are trying to benchmark D-FINE's performance on smaller Roboflow datasets, like those in Roboflow 100, and we are seeing sub-5% mAP as well as crashes on every dataset we try, both with and without the Objects365 pretrained weights.

We're using the fine-tuning config you defined here and simply replacing the dataset paths with a Roboflow dataset. We want to make sure we represent your work in the best light, so do you have any advice on how to get better metrics here? We noticed that you recommended training from scratch to the other people hitting similar issues, but we didn't see anyone confirm that it worked for them, and it hasn't helped much for us either.
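Concretely, "replacing the dataset paths" amounts to the substitution sketched below: we export the dataset in COCO format and point the template's train/val image folders and annotation files at the Roboflow export (the URL is a placeholder; the full script follows):

import os
import roboflow

# hypothetical URL; any Roboflow Universe dataset URL is handled the same way
dataset = roboflow.download_dataset(
    "https://universe.roboflow.com/<workspace>/<project>/<version>", "coco"
)

# Roboflow's COCO export keeps a _annotations.coco.json inside each split folder
train_images = os.path.join(dataset.location, "train")
train_anns = os.path.join(train_images, "_annotations.coco.json")
val_images = os.path.join(dataset.location, "valid")
val_anns = os.path.join(val_images, "_annotations.coco.json")

# these four paths (plus num_classes) are what we substitute into
# configs/dataset/custom_detection.yml before training
print(train_images, train_anns, val_images, val_anns)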

Here's how we've been launching the benchmark. Please let us know if we're doing something obviously wrong!

import roboflow
import os
import json
import subprocess
import torch
import fire
import re

model_name_to_config_map = {
    "dfine_s": "configs/dfine/custom/objects365/dfine_hgnetv2_s_obj2custom.yml",
    "dfine_m": "configs/dfine/custom/objects365/dfine_hgnetv2_m_obj2custom.yml",
    "dfine_l": "configs/dfine/custom/objects365/dfine_hgnetv2_l_obj2custom.yml",
    "dfine_x": "configs/dfine/custom/objects365/dfine_hgnetv2_x_obj2custom.yml",
}
dataset_config_template = "configs/dataset/custom_detection.yml"

generated_model_config_base_dir = "configs/dfine/custom/roboflow"
generated_dataset_config_base_dir = "configs/dataset/roboflow"

model_name_to_o365_checkpoint_map = {
    "dfine_s": "https://github.com/Peterande/storage/releases/download/dfinev1.0/dfine_s_obj365.pth",
    "dfine_m": "https://github.com/Peterande/storage/releases/download/dfinev1.0/dfine_m_obj365.pth",
    "dfine_l": "https://github.com/Peterande/storage/releases/download/dfinev1.0/dfine_l_obj365.pth",
    "dfine_x": "https://github.com/Peterande/storage/releases/download/dfinev1.0/dfine_x_obj365.pth",
}

def train_on_roboflow_url(roboflow_url, model_name="dfine_s", output_dir="./output"):
    # load dataset and related info
    print(f"Downloading dataset from {roboflow_url}")
    dataset = roboflow.download_dataset(roboflow_url, "coco")

    dataset_train_image_folder = os.path.join(dataset.location, "train")
    dataset_train_annotation_file = os.path.join(dataset_train_image_folder, "_annotations.coco.json")

    dataset_val_image_folder = os.path.join(dataset.location, "valid")

    with open(dataset_train_annotation_file, "r") as f:
        train_annotations = json.load(f)

    num_classes = len(train_annotations["categories"])

    del train_annotations

    # construct dataset config
    print(f"Creating dataset config in {output_dir}")
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(generated_dataset_config_base_dir, exist_ok=True)
    os.makedirs(generated_model_config_base_dir, exist_ok=True)

    with open(dataset_config_template, "r") as f:
        dataset_config = f.read()

    dataset_config = dataset_config.replace("num_classes: 777 # your dataset classes", f"num_classes: {num_classes}")

    dataset_config = dataset_config.replace("/data/yourdataset/train", dataset_train_image_folder)
    dataset_config = dataset_config.replace("train.json", "_annotations.coco.json")

    dataset_config = dataset_config.replace("/data/yourdataset/val", dataset_val_image_folder)
    dataset_config = dataset_config.replace("val.json", "_annotations.coco.json")

    dataset_filename = f"{dataset.name}_dfine_dataset_config.yml"
    dataset_config_save_name = os.path.join(generated_dataset_config_base_dir, dataset_filename)

    with open(dataset_config_save_name, "w") as f:
        f.write(dataset_config)

    # construct model config
    print(f"Creating model config in {output_dir}")
    model_config = model_name_to_config_map[model_name]

    with open(model_config, "r") as f:
        model_config = f.read()

    # ensure the standardized bs and epochs
    # model_config = model_config.replace("epochs: 64", "epochs: 1")
    epochs = re.search(r"epochs: (\d+)", model_config).group(1)
    model_config = model_config.replace(f"epochs: {epochs}", "epochs: 100")
    # model_config = model_config.replace("total_batch_size: 128", "total_batch_size: 16")
    # model_config = model_config.replace("total_batch_size: 256", "total_batch_size: 16")
    model_config = model_config.replace("train_dataloader:", "train_dataloader:\n  total_batch_size: 16")
    epoch = re.search(r"epoch: (\d+)", model_config).group(1)
    model_config = model_config.replace(f"epoch: {epoch}", "epoch: 90")
    stop_epoch = re.search(r"stop_epoch: (\d+)", model_config).group(1)
    model_config = model_config.replace(f"stop_epoch: {stop_epoch}", "stop_epoch: 90")

    model_config = model_config.replace("dataset/custom_detection.yml", os.path.join("..", dataset_config_save_name))

    # model_config = model_config.replace("\'../", "\'../../")  # since the model config is in an extra subdir

    train_output_dir = os.path.join(output_dir, f"{dataset.name}_{model_name}_train_output")
    model_size = model_name.split("_")[1]
    model_config = model_config.replace(f"output_dir: ./output/dfine_hgnetv2_{model_size}_obj2custom", f"output_dir: {train_output_dir}")

    model_config_save_name = os.path.join(generated_model_config_base_dir, f"{dataset.name}_{model_name}_model_config.yml")

    with open(model_config_save_name, "w") as f:
        f.write(model_config)

    # train model
    o365_checkpoint_url = model_name_to_o365_checkpoint_map[model_name]
    o365_checkpoint_name = o365_checkpoint_url.split("/")[-1]
    o365_checkpoint_path = os.path.join(output_dir, o365_checkpoint_name)

    if not os.path.exists(o365_checkpoint_path):
        print(f"Downloading O365 checkpoint from {o365_checkpoint_url}")
        subprocess.run(["wget", o365_checkpoint_url, "-O", o365_checkpoint_path])

    print(f"Training model in {train_output_dir}")
    num_gpus = torch.cuda.device_count()
    train_result = subprocess.run([
        "torchrun",
        "--nproc_per_node", str(num_gpus),
        "--rdzv_endpoint", "localhost:0",
        "--rdzv-backend", "c10d",
        "train.py",
        "-c", model_config_save_name,
        "--use-amp",
        "--seed=0",
        "-t", o365_checkpoint_path
    ])

    # get test set performance
    dataset_config = dataset_config.replace("valid", "test")

    with open(dataset_config_save_name, "w") as f:
        f.write(dataset_config)

    stg1_checkpoint_path = os.path.join(train_output_dir, "best_stg1.pth")
    stg2_checkpoint_path = os.path.join(train_output_dir, "best_stg2.pth")

    if os.path.exists(stg2_checkpoint_path):
        print(f"Testing with STG2 checkpoint {stg2_checkpoint_path}")
        checkpoint_path = stg2_checkpoint_path
    elif os.path.exists(stg1_checkpoint_path):
        print(f"Testing with STG1 checkpoint {stg1_checkpoint_path}")
        checkpoint_path = stg1_checkpoint_path
    else:
        raise ValueError(f"No checkpoint found in {train_output_dir}")

    test_result = subprocess.run([
        "torchrun",
        "--nproc_per_node", str(num_gpus),
        "--rdzv_endpoint", "localhost:0",
        "--rdzv-backend", "c10d",
        "train.py",
        "-c", model_config_save_name,
        "--test-only",
        "-r", checkpoint_path
    ])

    test_stats_pth = os.path.join(train_output_dir, "test_stats.pth")
    test_stats = torch.load(test_stats_pth, weights_only=False)

    results_json = {
        "model_name": model_name,
        "map": test_stats["coco_eval_bbox"][0],
        "map50": test_stats["coco_eval_bbox"][1],
        "url": roboflow_url,
    }

    results_json_pth = os.path.join(train_output_dir, "results.json")
    with open(results_json_pth, "w") as f:
        json.dump(results_json, f, indent=2)

if __name__ == "__main__":
    fire.Fire(train_on_roboflow_url)
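We save this as benchmark_dfine.py (the filename is ours) and launch one run per dataset via Fire, e.g.:

python benchmark_dfine.py \
    --roboflow_url "https://universe.roboflow.com/<workspace>/<project>/<version>" \
    --model_name dfine_s \
    --output_dir ./output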