[Feat] PyTorch - VIP backbone and VIPTR recognition module by lkosh · Pull Request #1912 · mindee/doctr (original) (raw)

I corrected the code according to your comments - added recognition config, changed typing, deleted empty tensorflow files, etc. Regarding vip_tiny and vip_base - I checked that the input parameters match functions VIPTRv2 and VIPTRv2B of the official implementation, assuming that's the variant you meant to implement :) I've started training again, hopefully it works better now :)

Looks already better 👍

But one thing should be reverted

You removed the `include_top` logic from the classification model. That logic was fine: with it we can use the model as a classification model, and later, for recognition, we import the same classification model as a feature extractor (without the linear classification head) and build further on top of it. In the case of VIPTR, that is only another linear layer.

See here:

snippet from vit (classification & backbone for vitstr)

class ClassifierHead(nn.Module):
    """Linear classification head for the Vision Transformer

    Args:
        in_channels: number of input channels
        num_classes: number of output classes
    """

    def __init__(self, in_channels: int, num_classes: int) -> None:
        super().__init__()
        self.head = nn.Linear(in_channels, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Keep only index 0 along dim 1 (the cls token) before projecting,
        # which yields a (batch_size, num_classes) output
        cls_token = x.select(1, 0)
        return self.head(cls_token)


class VisionTransformer(nn.Sequential):
    """Vision Transformer model, following
    `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale",
    <https://arxiv.org/pdf/2010.11929.pdf>`_.

    The model is a sequence of a patch embedding, an encoder block and,
    when ``include_top`` is set, a classifier head.

    Args:
        d_model: dimension of the transformer layers
        num_layers: number of transformer layers
        num_heads: number of attention heads
        ffd_ratio: multiplier for the hidden dimension of the feedforward layer
        patch_size: size of the patches
        input_shape: size of the input image
        dropout: dropout rate
        num_classes: number of output classes
        include_top: whether the classifier head should be instantiated
        cfg: optional configuration dictionary, stored as the ``cfg`` attribute
    """

    def __init__(
        self,
        d_model: int,
        num_layers: int,
        num_heads: int,
        ffd_ratio: int,
        patch_size: tuple[int, int] = (4, 4),
        input_shape: tuple[int, int, int] = (3, 32, 32),
        dropout: float = 0.0,
        num_classes: int = 1000,
        include_top: bool = True,
        cfg: dict[str, Any] | None = None,
    ) -> None:
        ffd_dim = d_model * ffd_ratio
        embedding = PatchEmbedding(input_shape, d_model, patch_size)
        encoder = EncoderBlock(num_layers, num_heads, d_model, ffd_dim, dropout, nn.GELU())
        # The head is optional so the model can also be built without a classifier
        head: tuple[nn.Module, ...] = (ClassifierHead(d_model, num_classes),) if include_top else ()

        super().__init__(embedding, encoder, *head)
        self.cfg = cfg

and on recognition side (vitstr)

def _vitstr(
  arch: str,
  pretrained: bool,
  backbone_fn: Callable[[bool], nn.Module],
  layer: str,
  ignore_keys: list[str] | None = None,
  **kwargs: Any,
) -> ViTSTR:
  """Build a ViTSTR model for the given architecture name.

  Args:
      arch: key of the architecture in ``default_cfgs``
      pretrained: whether the pretrained parameters should be loaded
      backbone_fn: constructor of the backbone model
      layer: name of the backbone layer exposed as ``"features"``
      ignore_keys: parameter keys to skip when loading pretrained weights with a custom vocab
      kwargs: keyword arguments forwarded to ``ViTSTR`` (``vocab``, ``input_shape`` and
          ``patch_size`` are also read here)

  Returns:
      the ViTSTR model
  """
  # Patch the config with user overrides and forward the resolved values to the model
  cfg = deepcopy(default_cfgs[arch])
  for key in ("vocab", "input_shape"):
    cfg[key] = kwargs.get(key, cfg[key])
    kwargs[key] = cfg[key]

  # These options are consumed here and must not reach the ViTSTR constructor
  patch_size = kwargs.pop("patch_size", (4, 8))
  kwargs.pop("pretrained_backbone", None)

  # Feature extractor
  # NOTE: we don't use a pretrained backbone for non-rectangular patches to avoid the pos embed mismatch
  backbone = backbone_fn(False, input_shape=cfg["input_shape"], patch_size=patch_size)  # type: ignore[call-arg]
  feat_extractor = IntermediateLayerGetter(backbone, {layer: "features"})

  # Build the model
  model = ViTSTR(feat_extractor, cfg=cfg, **kwargs)
  if not pretrained:
    return model

  # With a custom vocab the number of classes differs from the pretrained model,
  # so the last layer weights are skipped while loading
  vocab_changed = cfg["vocab"] != default_cfgs[arch]["vocab"]
  load_pretrained_params(
    model,
    default_cfgs[arch]["url"],
    ignore_keys=ignore_keys if vocab_changed else None,
  )
  return model


def vitstr_small(pretrained: bool = False, **kwargs: Any) -> ViTSTR:
  """Build the ViTSTR-Small model from `"Vision Transformer for Fast and Efficient Scene Text Recognition"
  <https://arxiv.org/pdf/2105.08582.pdf>`_.

  >>> import torch
  >>> from doctr.models import vitstr_small
  >>> model = vitstr_small(pretrained=False)
  >>> input_tensor = torch.rand((1, 3, 32, 128))
  >>> out = model(input_tensor)

  Args:
      pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
      kwargs: keyword arguments of the ViTSTR architecture

  Returns:
      text recognition architecture
  """
  # Head parameters passed as ``ignore_keys`` to the shared builder
  head_keys = ["head.weight", "head.bias"]
  return _vitstr(
      "vitstr_small",
      pretrained,
      vit_s,
      "1",
      embedding_units=384,
      patch_size=(4, 8),
      ignore_keys=head_keys,
      **kwargs,
  )

In this case we use the `IntermediateLayerGetter` to get the whole model but exclude the head - that's the preferred way. Alternatively, there is option 2 (building the backbone with `include_top=False`), which should also work for your model:


def _viptr(
    arch: str,
    pretrained: bool,
    backbone_fn: Callable[[bool], nn.Module],
    layer: str,
    pretrained_backbone: bool = True,
    ignore_keys: list[str] | None = None,
    **kwargs: Any,
) -> VIPTR:
    """Sketch of the VIPTR builder using a backbone built without its classifier head.

    The ``...`` lines stand for the parts of the builder that are elided in this sketch.
    """
    # The backbone weights are only requested when the full model is not loaded pretrained
    pretrained_backbone = pretrained_backbone and not pretrained

    ...
    # Build the backbone without the classification head; call the constructor that
    # was passed in so the builder is not tied to a single backbone variant
    feature_extractor = backbone_fn(pretrained_backbone, include_top=False)  # type: ignore[call-arg]
    ...

If you want to use the IntermediateLayerGetter

from doctr.models import viptr_tiny

# Printing the model shows its module structure, including the layer keys
print(viptr_tiny(pretrained=False))

This will print the model structure where you can see which key you need to extract :)