[Feat] PyTorch - VIP backbone and VIPTR recognition module by lkosh · Pull Request #1912 · mindee/doctr (original) (raw)
I corrected the code according to your comments - added recognition config, changed typing, deleted empty tensorflow files, etc. Regarding vip_tiny and vip_base - I checked that the input parameters match functions VIPTRv2 and VIPTRv2B of the official implementation, assuming that's the variant you meant to implement :) I've started training again, hopefully it works better now :)
Looks already better 👍
But one thing should be reverted
You removed the include_top logic from the classification model. That logic was fine: with it, the model can still be used as a plain classification model, and later, for recognition, we import the classification model as a feature extractor (without the linear classification head) and build on top of it. In the case of VIPTR, that is only one more linear layer.
See here:
snippet from vit (classification & backbone for vitstr)
class ClassifierHead(nn.Module):
    """Classifier head for Vision Transformer

    Args:
        in_channels: number of input channels
        num_classes: number of output classes
    """

    def __init__(
        self,
        in_channels: int,
        num_classes: int,
    ) -> None:
        super().__init__()
        # Single linear projection from the input channels to the class logits
        self.head = nn.Linear(in_channels, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project the first token of every sample to class logits.

        Args:
            x: input features; only ``x[:, 0]`` is read
                (assumes a (batch_size, seq_len, in_channels) layout with the
                cls token at index 0 - TODO confirm against the encoder output)

        Returns:
            the output of the linear head applied to ``x[:, 0]``
        """
        # (batch_size, num_classes) cls token
        return self.head(x[:, 0])
class VisionTransformer(nn.Sequential):
    """VisionTransformer architecture as described in
    `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale",
    <https://arxiv.org/pdf/2010.11929.pdf>`_.

    Args:
        d_model: dimension of the transformer layers
        num_layers: number of transformer layers
        num_heads: number of attention heads
        ffd_ratio: multiplier for the hidden dimension of the feedforward layer
        patch_size: size of the patches
        input_shape: size of the input image
        dropout: dropout rate
        num_classes: number of output classes
        include_top: whether the classifier head should be instantiated
        cfg: optional configuration dictionary, stored as-is on ``self.cfg``
    """

    def __init__(
        self,
        d_model: int,
        num_layers: int,
        num_heads: int,
        ffd_ratio: int,
        patch_size: tuple[int, int] = (4, 4),
        input_shape: tuple[int, int, int] = (3, 32, 32),
        dropout: float = 0.0,
        num_classes: int = 1000,
        include_top: bool = True,
        cfg: dict[str, Any] | None = None,
    ) -> None:
        # Patch embedding followed by the encoder
        _layers: list[nn.Module] = [
            PatchEmbedding(input_shape, d_model, patch_size),
            EncoderBlock(num_layers, num_heads, d_model, d_model * ffd_ratio, dropout, nn.GELU()),
        ]
        # The classification head is optional: without it the model only
        # contains the embedding and encoder layers
        if include_top:
            _layers.append(ClassifierHead(d_model, num_classes))

        super().__init__(*_layers)
        self.cfg = cfg
and on the recognition side (vitstr):
def _vitstr(
    arch: str,
    pretrained: bool,
    backbone_fn: Callable[[bool], nn.Module],
    layer: str,
    ignore_keys: list[str] | None = None,
    **kwargs: Any,
) -> ViTSTR:
    """Build a ViTSTR recognition model on top of a backbone.

    Args:
        arch: key of the architecture in ``default_cfgs``
        pretrained: whether the parameters referenced by the architecture's ``url`` should be loaded
        backbone_fn: callable building the backbone; it is always called with ``False`` as first
            argument, plus the ``input_shape`` and ``patch_size`` keyword arguments
        layer: name of the backbone layer whose output is exposed as ``"features"``
        ignore_keys: parameter names to skip when loading pretrained parameters with a custom vocab
        **kwargs: ``vocab``, ``input_shape`` and ``patch_size`` are read here; everything except
            ``patch_size`` and ``pretrained_backbone`` is forwarded to ``ViTSTR``

    Returns:
        the ViTSTR model
    """
    # Patch the config
    _cfg = deepcopy(default_cfgs[arch])
    _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"])
    _cfg["input_shape"] = kwargs.get("input_shape", _cfg["input_shape"])
    patch_size = kwargs.get("patch_size", (4, 8))

    kwargs["vocab"] = _cfg["vocab"]
    kwargs["input_shape"] = _cfg["input_shape"]

    # Feature extractor
    feat_extractor = IntermediateLayerGetter(
        # NOTE: we don't use a pretrained backbone for non-rectangular patches to avoid the pos embed mismatch
        backbone_fn(False, input_shape=_cfg["input_shape"], patch_size=patch_size),  # type: ignore[call-arg]
        {layer: "features"},
    )

    # These keys are consumed here and are not forwarded to ViTSTR
    kwargs.pop("patch_size", None)
    kwargs.pop("pretrained_backbone", None)

    # Build the model
    model = ViTSTR(feat_extractor, cfg=_cfg, **kwargs)
    # Load pretrained parameters
    if pretrained:
        # The number of classes is not the same as the number of classes in the pretrained model =>
        # remove the last layer weights
        _ignore_keys = ignore_keys if _cfg["vocab"] != default_cfgs[arch]["vocab"] else None
        load_pretrained_params(model, default_cfgs[arch]["url"], ignore_keys=_ignore_keys)

    return model
def vitstr_small(pretrained: bool = False, **kwargs: Any) -> ViTSTR:
    """ViTSTR-Small as described in `"Vision Transformer for Fast and Efficient Scene Text Recognition"
    <https://arxiv.org/pdf/2105.08582.pdf>`_.

    >>> import torch
    >>> from doctr.models import vitstr_small
    >>> model = vitstr_small(pretrained=False)
    >>> input_tensor = torch.rand((1, 3, 32, 128))
    >>> out = model(input_tensor)

    Args:
        pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
        kwargs: keyword arguments of the ViTSTR architecture

    Returns:
        text recognition architecture
    """
    return _vitstr(
        "vitstr_small",
        pretrained,
        vit_s,
        # Name of the backbone layer whose output is used as features
        "1",
        embedding_units=384,
        patch_size=(4, 8),
        # Head parameters skipped on load when a custom vocab is used
        ignore_keys=["head.weight", "head.bias"],
        **kwargs,
    )
In this case we use the IntermediateLayerGetter to get the whole model but exclude the head - that's the preferred way. Option 2, which should also work for your model, is to build the backbone without its head directly:
def _viptr(
    arch: str,
    pretrained: bool,
    backbone_fn: Callable[[bool], nn.Module],
    layer: str,
    pretrained_backbone: bool = True,
    ignore_keys: list[str] | None = None,
    **kwargs: Any,
) -> VIPTR:
    # Sketch only: the `...` lines stand for code omitted from this example.
    # Backbone weights are only requested when the full model is not loaded from pretrained parameters
    pretrained_backbone = pretrained_backbone and not pretrained
    ...
    # Build the backbone without its classification head
    feature_extractor = vip_tiny(pretrained_backbone, include_top=False)
    ...
If you want to use the IntermediateLayerGetter:
from doctr.models import viptr_tiny

print(viptr_tiny(pretrained=False))
This will print the model structure where you can see which key you need to extract :)