Use custom RT-DETR model for FoundationPose
Fine, I figured it out. The custom model needs some work to match the expected structure and data format; namely, its inputs and outputs have to be reconstructed.
I trained the model with the original author's rt_detr from GitHub, then converted it to ONNX with the following code:
#!/usr/bin/env python3
# export_rtdetr_correct_format.py
import torch
import torch.nn as nn
from ultralytics import RTDETR
import numpy as np
import cv2
class RtDetrCorrectFormat(nn.Module):
    """Format fix: convert center+size boxes to corner coordinates."""

    def __init__(self, model_path):
        super().__init__()
        rtdetr = RTDETR(model_path)
        self.model = rtdetr.model
        self.num_classes = rtdetr.model.nc
        print(f"Number of classes: {self.num_classes}")

    def forward(self, images, orig_target_sizes):
        """
        Key fix: the model outputs [cx, cy, w, h] and must be converted
        to [x1, y1, x2, y2].
        """
        orig_target_sizes = orig_target_sizes.to(torch.int64)
        # 1. Get the raw model output
        outputs = self.model(images)
        if isinstance(outputs, tuple):
            outputs = outputs[0]
        batch_size = outputs.shape[0]
        # 2. Extract boxes - assumed format is [cx, cy, w, h]
        boxes_raw = outputs[..., :4]  # [batch, 300, 4]
        # 3. DEBUG: print the raw box format
        debug = True
        if debug and batch_size == 1:
            print(f"DEBUG - raw boxes[0, 0]: {boxes_raw[0, 0].tolist()}")
            # Manual conversion as a sanity check
            cx, cy, w, h = boxes_raw[0, 0]
            x1 = cx - w / 2
            y1 = cy - h / 2
            x2 = cx + w / 2
            y2 = cy + h / 2
            print(f"DEBUG - converted: [{x1:.1f}, {y1:.1f}, {x2:.1f}, {y2:.1f}]")
        # 4. Convert [cx, cy, w, h] to [x1, y1, x2, y2]
        # boxes_raw shape: [batch, 300, 4] where 4 = [cx, cy, w, h]
        cx = boxes_raw[..., 0:1]  # keep the last dim: [batch, 300, 1]
        cy = boxes_raw[..., 1:2]
        w = boxes_raw[..., 2:3]
        h = boxes_raw[..., 3:4]
        x1 = cx - w / 2
        y1 = cy - h / 2
        x2 = cx + w / 2
        y2 = cy + h / 2
        # Concatenate into [x1, y1, x2, y2]
        boxes_corners = torch.cat([x1, y1, x2, y2], dim=-1)
        # 5. Scale normalized coordinates to pixel coordinates
        img_h = orig_target_sizes[:, 0:1].unsqueeze(1).float()
        img_w = orig_target_sizes[:, 1:2].unsqueeze(1).float()
        scale_factor = torch.cat([img_w, img_h, img_w, img_h], dim=-1)
        boxes_pixel = boxes_corners * scale_factor
        # 6. Clamp coordinates to the image bounds
        boxes_pixel = torch.clamp(boxes_pixel, min=0)
        max_coords = torch.cat([img_w, img_h, img_w, img_h], dim=-1)
        boxes_pixel = torch.min(boxes_pixel, max_coords)
        # 7. Extract scores and labels
        scores_all = outputs[..., 4:]
        scores, labels = torch.max(scores_all, dim=-1)
        # 8. Dummy term so orig_target_sizes is not optimized out of the graph
        dummy = (img_w.mean() + img_h.mean()) * 0.0
        # 9. Return in the (labels, boxes, scores) format the pipeline expects
        return (labels.to(torch.int64) + dummy.to(torch.int64),
                boxes_pixel.to(torch.float32) + dummy,
                scores.to(torch.float32) + dummy)

def test_with_visualization():
    """Verify the conversion visually."""
    print("\n" + "=" * 60)
    print("Visually verifying the format conversion...")
    # Load the image
    img = cv2.imread("image2.png")
    original_h, original_w = img.shape[:2]
    # Preprocess
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_resized = cv2.resize(img_rgb, (640, 640))
    img_normalized = img_resized.astype(np.float32) / 255.0
    img_tensor = torch.from_numpy(img_normalized).permute(2, 0, 1).unsqueeze(0)
    # Build the model
    model = RtDetrCorrectFormat("best.pt")
    model.eval()
    # Inference
    orig_target_sizes = torch.tensor([[original_h, original_w]], dtype=torch.int64)
    with torch.no_grad():
        labels, boxes, scores = model(img_tensor, orig_target_sizes)
    # Find the best detection
    best_idx = torch.argmax(scores[0]).item()
    best_box = boxes[0, best_idx].tolist()
    best_score = scores[0, best_idx].item()
    print(f"\nConverted bounding box: [{best_box[0]:.1f}, {best_box[1]:.1f}, {best_box[2]:.1f}, {best_box[3]:.1f}]")
    width = best_box[2] - best_box[0]
    height = best_box[3] - best_box[1]
    print(f"Width x height: {width:.1f}x{height:.1f}")
    # Visualize
    vis_img = img.copy()
    x1, y1, x2, y2 = map(int, best_box)
    # Draw the exported model's detection box (green)
    cv2.rectangle(vis_img, (x1, y1), (x2, y2), (0, 255, 0), 3)
    label = f"Export: {width:.0f}x{height:.0f}"
    cv2.putText(vis_img, label, (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
    # Draw the offline model's detection box (red) for comparison
    offline_box = [95.4, 11.7, 540.1, 309.5]
    ox1, oy1, ox2, oy2 = map(int, offline_box)
    cv2.rectangle(vis_img, (ox1, oy1), (ox2, oy2), (0, 0, 255), 2)
    offline_label = "Offline: 445x298"
    cv2.putText(vis_img, offline_label, (ox1, oy1 - 40),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
    # Display
    cv2.imshow('Format Correction Test (Green=Export, Red=Offline)', vis_img)
    print("\nThe green box should coincide exactly with the red box!")
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    # Compute the error
    error_x1 = abs(x1 - ox1)
    error_y1 = abs(y1 - oy1)
    error_x2 = abs(x2 - ox2)
    error_y2 = abs(y2 - oy2)
    print("\nCoordinate errors:")
    print(f"  x1 error: {error_x1} px")
    print(f"  y1 error: {error_y1} px")
    print(f"  x2 error: {error_x2} px")
    print(f"  y2 error: {error_y2} px")
    if max(error_x1, error_y1, error_x2, error_y2) < 5:
        print("✓ Format conversion succeeded!")
        return True
    else:
        print("⚠ Format conversion still has errors")
        return False

def export_model():
    """Export the corrected model."""
    print("Exporting RT-DETR (corrected box format)")
    print("=" * 60)
    # Verify the conversion first
    if not test_with_visualization():
        print("\n⚠ Format conversion check failed; inspect the model's output format")
        return
    print("\n" + "=" * 60)
    print("Exporting the ONNX model...")
    # Build the model
    model = RtDetrCorrectFormat("best.pt")
    model.eval()
    # Prepare dummy export inputs
    images = torch.randn(1, 3, 640, 640, dtype=torch.float32)
    orig_target_sizes = torch.tensor([[480, 640]], dtype=torch.int64)
    # Export
    torch.onnx.export(
        model,
        (images, orig_target_sizes),
        "best3.onnx",
        opset_version=16,
        input_names=['images', 'orig_target_sizes'],
        output_names=['labels', 'boxes', 'scores'],
        dynamic_axes={
            'images': {0: 'batch_size'},
            'orig_target_sizes': {0: 'batch_size'},
            'labels': {0: 'batch_size'},
            'boxes': {0: 'batch_size'},
            'scores': {0: 'batch_size'}
        }
    )
    print("✓ ONNX export finished: best3.onnx")
    # Verify
    verify_export()

def verify_export():
    """Verify the exported model."""
    import onnxruntime as ort
    print("\nVerifying the exported model...")
    session = ort.InferenceSession("best3.onnx")
    # Validate with a test image
    img = cv2.imread("image2.png")
    original_h, original_w = img.shape[:2]
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_resized = cv2.resize(img_rgb, (640, 640))
    img_normalized = img_resized.astype(np.float32) / 255.0
    img_input = img_normalized.transpose(2, 0, 1)[np.newaxis, ...]
    orig_target_sizes_np = np.array([[original_h, original_w]], dtype=np.int64)
    outputs = session.run(None, {
        'images': img_input,
        'orig_target_sizes': orig_target_sizes_np
    })
    labels, boxes, scores = outputs
    best_idx = np.argmax(scores[0])
    best_box = boxes[0, best_idx]
    print("\nONNX model output:")
    print(f"  Best detection box: [{best_box[0]:.1f}, {best_box[1]:.1f}, {best_box[2]:.1f}, {best_box[3]:.1f}]")
    width = best_box[2] - best_box[0]
    height = best_box[3] - best_box[1]
    print(f"  Width x height: {width:.1f}x{height:.1f}")
    # Check against the offline result
    expected_width = 444.7
    expected_height = 297.8
    width_match = abs(width - expected_width) < 2.0
    height_match = abs(height - expected_height) < 2.0
    if width_match and height_match:
        print("✓ Matches the offline result!")
    else:
        print(f"⚠ Still differs: expected {expected_width:.1f}x{expected_height:.1f}, got {width:.1f}x{height:.1f}")

if __name__ == "__main__":
    print("RT-DETR box format correction tool")
    print("Converting [cx,cy,w,h] to [x1,y1,x2,y2]")
    print("=" * 60)
    export_model()
    print("\n" + "=" * 60)
    print("TensorRT conversion command:")
    print("""
    /usr/src/tensorrt/bin/trtexec \\
        --onnx=best3.onnx \\
        --saveEngine=best3.plan \\
        --explicitBatch \\
        --minShapes=images:1x3x640x640,orig_target_sizes:1x2 \\
        --optShapes=images:1x3x640x640,orig_target_sizes:1x2 \\
        --maxShapes=images:4x3x640x640,orig_target_sizes:4x2 \\
        --workspace=2048 \\
        --fp16
    """)
It worked once the ONNX model was converted to a .plan engine. By the way, the ImageNormalizeNode should also be added to the launch file so the incoming image is normalized.
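For anyone wiring that up, here is a minimal sketch of what that composable node could look like in the launch file. The package name, plugin string, topics, and parameter names below are my assumptions, not taken from an official example, so verify them against your Isaac ROS release:

# Hypothetical launch-file fragment. The package, plugin string, topics,
# and parameters are assumptions; check them against your Isaac ROS release.
from launch_ros.descriptions import ComposableNode

image_normalize_node = ComposableNode(
    name='image_normalize_node',
    package='isaac_ros_image_proc',  # assumed package
    plugin='nvidia::isaac_ros::image_proc::ImageNormalizeNode',  # assumed plugin
    parameters=[{
        # Plain 0-1 scaling, matching the /255.0 preprocessing in the export script
        'mean': [0.0, 0.0, 0.0],
        'stddev': [1.0, 1.0, 1.0],
    }],
    remappings=[
        ('image', 'image_rect'),  # assumed input topic
        ('normalized_image', 'normalized_image'),
    ],
)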
The key point is to restructure the trained model so its inputs and outputs satisfy the pipeline in isaac_ros_rtdetr.
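As a quick sanity check on that contract, a short onnxruntime snippet (assuming the best3.onnx produced above) can confirm the exported graph exposes exactly the tensor names declared in the export script:

#!/usr/bin/env python3
# check_rtdetr_io.py - sanity-check the exported graph's tensor names
import onnxruntime as ort

session = ort.InferenceSession("best3.onnx")

print("Inputs:")
for inp in session.get_inputs():
    print(f"  {inp.name}: shape={inp.shape}, dtype={inp.type}")
print("Outputs:")
for out in session.get_outputs():
    print(f"  {out.name}: shape={out.shape}, dtype={out.type}")

# These names mirror the input_names/output_names passed to torch.onnx.export
assert {i.name for i in session.get_inputs()} == {"images", "orig_target_sizes"}
assert {o.name for o in session.get_outputs()} == {"labels", "boxes", "scores"}
print("✓ Tensor names match the convention used in the export above")

If the assertions fail, re-check the input_names and output_names passed to torch.onnx.export.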