From d9b56ca9a0ed747510c875da13c3fb66988984d4 Mon Sep 17 00:00:00 2001 From: liuxuewen <18435135529@163.com> Date: Wed, 17 May 2023 20:27:25 +0800 Subject: [PATCH 1/7] feat(BeitV2):add finetune and pretrain code --- ppcls/arch/__init__.py | 22 +- ppcls/arch/backbone/__init__.py | 3 + .../backbone/model_zoo/modeling_finetune.py | 674 ++++++++++++++++++ .../backbone/model_zoo/modeling_pretrain.py | 364 ++++++++++ .../backbone/model_zoo/norm_ema_quantizer.py | 242 +++++++ ppcls/arch/backbone/model_zoo/vqkd.py | 308 ++++++++ .../BeitV2/BeitV2_base_patch16_224_ft.yaml | 170 +++++ .../BeitV2/BeitV2_base_patch16_224_pt.yaml | 165 +++++ ppcls/data/__init__.py | 1 + ppcls/data/dataloader/beitdataset.py | 63 ++ ppcls/data/preprocess/__init__.py | 5 +- .../data/preprocess/ops/masking_generator.py | 118 +++ .../ops/random_crop_and_interpolation.py | 285 ++++++++ ppcls/data/preprocess/ops/random_erasing.py | 89 ++- .../data/preprocess/ops/transforms_factory.py | 236 ++++++ ppcls/engine/engine.py | 6 +- ppcls/engine/train/train.py | 6 +- ppcls/loss/__init__.py | 2 +- ppcls/loss/distillationloss.py | 30 + ppcls/optimizer/optimizer.py | 83 ++- 20 files changed, 2862 insertions(+), 10 deletions(-) create mode 100644 ppcls/arch/backbone/model_zoo/modeling_finetune.py create mode 100644 ppcls/arch/backbone/model_zoo/modeling_pretrain.py create mode 100644 ppcls/arch/backbone/model_zoo/norm_ema_quantizer.py create mode 100644 ppcls/arch/backbone/model_zoo/vqkd.py create mode 100644 ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_ft.yaml create mode 100644 ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml create mode 100644 ppcls/data/dataloader/beitdataset.py create mode 100644 ppcls/data/preprocess/ops/masking_generator.py create mode 100644 ppcls/data/preprocess/ops/random_crop_and_interpolation.py create mode 100644 ppcls/data/preprocess/ops/transforms_factory.py diff --git a/ppcls/arch/__init__.py b/ppcls/arch/__init__.py index 0379edb4d4..3990197b04 100644 --- a/ppcls/arch/__init__.py +++ b/ppcls/arch/__init__.py @@ -14,6 +14,7 @@ import copy import importlib +import paddle import paddle.nn as nn from paddle.jit import to_static from paddle.static import InputSpec @@ -28,7 +29,7 @@ from .slim import prune_model, quantize_model from .distill.afd_attention import LinearTransformStudent, LinearTransformTeacher -__all__ = ["build_model", "RecModel", "DistillationModel", "AttentionModel"] +__all__ = ["build_model", "RecModel", "DistillationModel", "AttentionModel", "Beitv2Model"] def build_model(config, mode="train"): @@ -168,4 +169,23 @@ def forward(self, x, label=None): else: out = self.model_list[idx](out, label) result_dict.update(out) + return result_dict + +class Beitv2Model(DistillationModel): + def __init__(self, + models=None, + pretrained_list=None, + freeze_params_list=None, + **kargs): + super().__init__(models, pretrained_list, freeze_params_list, **kargs) + def forward(self, samples, images, bool_masked): + result_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + bool_masked_pos = bool_masked.flatten(1).astype(paddle.bool) + if model_name == "Teacher": + with paddle.no_grad(): + input_ids = self.model_list[idx].get_codebook_indices(images) + result_dict[model_name] = input_ids[bool_masked_pos] + else: + result_dict[model_name] = self.model_list[idx](samples, bool_masked_pos) return result_dict \ No newline at end of file diff --git a/ppcls/arch/backbone/__init__.py b/ppcls/arch/backbone/__init__.py index 78e9b4dc25..9dc12bd3dd 100644 --- 
a/ppcls/arch/backbone/__init__.py +++ b/ppcls/arch/backbone/__init__.py @@ -75,6 +75,9 @@ from .model_zoo.convnext import ConvNeXt_tiny, ConvNeXt_small, ConvNeXt_base_224, ConvNeXt_base_384, ConvNeXt_large_224, ConvNeXt_large_384 from .model_zoo.nextvit import NextViT_small_224, NextViT_base_224, NextViT_large_224, NextViT_small_384, NextViT_base_384, NextViT_large_384 from .model_zoo.cae import cae_base_patch16_224, cae_large_patch16_224 +from .model_zoo.vqkd import vqkd_encoder_base_decoder_3x768x12_clip +from .model_zoo.modeling_pretrain import beit_base_patch16_224_8k_vocab_cls_pt +from .model_zoo.modeling_finetune import beit_base_patch16_224 from .variant_models.resnet_variant import ResNet50_last_stage_stride1 from .variant_models.resnet_variant import ResNet50_adaptive_max_pool2d diff --git a/ppcls/arch/backbone/model_zoo/modeling_finetune.py b/ppcls/arch/backbone/model_zoo/modeling_finetune.py new file mode 100644 index 0000000000..a6b12cf761 --- /dev/null +++ b/ppcls/arch/backbone/model_zoo/modeling_finetune.py @@ -0,0 +1,674 @@ +import math +from functools import partial +import numpy as np +from collections.abc import Callable + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from scipy import interpolate + +from paddle.nn.initializer import TruncatedNormal, Constant, Normal +from collections import OrderedDict +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', + 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), + **kwargs + } + + +trunc_normal_ = TruncatedNormal(std=.02) +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + +def to_2tuple(x): + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return 'p={}'.format(self.drop_prob) + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # x = self.drop(x) + # commit this for the original BERT implement + x = self.fc2(x) + x = self.drop(x) + return x + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Attention(nn.Layer): + def __init__( + self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., + proj_drop=0., window_size=None, attn_head_dim=None): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias_attr=False) + if qkv_bias: + self.q_bias = self.create_parameter(shape=(all_head_dim,), default_initializer=zeros_) + self.add_parameter("q_bias", self.q_bias) + self.v_bias = self.create_parameter(shape=(all_head_dim,), default_initializer=zeros_) + self.add_parameter("v_bias", self.v_bias) + else: + self.q_bias = None + self.v_bias = None + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = self.create_parameter(shape=(self.num_relative_distance, num_heads), default_initializer=ones_) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + self.add_parameter("relative_position_bias_table", self.relative_position_bias_table) + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + + coords_flatten_1 = coords_flatten.unsqueeze(axis=2) + coords_flatten_2 = coords_flatten.unsqueeze(axis=1) + relative_coords = coords_flatten_1 - coords_flatten_2 + relative_coords = relative_coords.transpose([1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + paddle.zeros([(window_size[0] * window_size[1] + 1), ]*2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) 
+ self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + if self.relative_position_bias_table is not None: + trunc_normal_(self.relative_position_bias_table) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x, rel_pos_bias=None, return_attention=False, return_qkv=False): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = paddle.concat([self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias]) + qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape([B, N, 3, self.num_heads, C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) + + if self.relative_position_bias_table is not None: + index = self.relative_position_index.reshape([-1]) + + relative_position_bias = paddle.index_select( + self.relative_position_bias_table, index) + relative_position_bias = relative_position_bias.reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1 + ]) # Wh*Ww,Wh*Ww,nH + + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = self.softmax(attn) + attn = self.attn_drop(attn) + + if return_attention: + return attn + + x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + + if return_qkv: + return x, qkv + + return x + + +class Block(nn.Layer): + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer="nn.LayerNorm", + window_size=None, attn_head_dim=None): + super().__init__() + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=1e-6) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=1e-6) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + if init_values > 0: + x = init_values * paddle.ones([dim], dtype="float32") + self.gamma_1 = self.create_parameter(shape=(dim,), default_initializer=ones_) + self.add_parameter("gamma_1", self.gamma_1) + self.gamma_2 = self.create_parameter(shape=(dim,), default_initializer=ones_) + self.add_parameter("gamma_2", self.gamma_2) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, rel_pos_bias=None, return_attention=False, return_qkv=False): + if return_attention: + return self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, return_attention=True) + if return_qkv: + y, qkv = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, return_attention=return_qkv) + x = x + self.drop_path(self.gamma_1 * y) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x, qkv + + if self.gamma_1 is None: + x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * \ + (img_size[0] // patch_size[0]) + self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
+ + x = self.proj(x).flatten(2).transpose((0, 2, 1)) + return x + + +class RelativePositionBias(nn.Layer): + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = \ + self.create_parameter(shape=(self.num_relative_distance, num_heads), default_initializer=ones_) # 2*Wh-1 * 2*Ww-1, nH + self.add_parameter("relative_position_bias_table", self.relative_position_bias_table) + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + + coords_flatten_1 = coords_flatten.unsqueeze(axis=2) + coords_flatten_2 = coords_flatten.unsqueeze(axis=1) + relative_coords = coords_flatten_1 - coords_flatten_2 + relative_coords = relative_coords.transpose([1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + paddle.zeros([(window_size[0] * window_size[1] + 1), ]*2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + + def forward(self, ): + index = self.relative_position_index.reshape([-1]) + relative_position_bias = paddle.index_select( + self.relative_position_bias_table, index) + relative_position_bias = relative_position_bias.reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1 + ]) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww # nH, Wh*Ww, Wh*Ww + + + +class VisionTransformer(nn.Layer): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer='nn.LayerNorm', init_values=None, + use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, + use_mean_pooling=True, init_scale=0.001): + super().__init__() + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = self.create_parameter(shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + # self.mask_token = self.create_parameter(shape=(1, 1, embed_dim), default_initializer=zeros_) + if use_abs_pos_emb: + self.pos_embed = self.create_parameter(shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_) + self.add_parameter("pos_embed", self.pos_embed) + else: + self.pos_embed = None + self.pos_drop = nn.Dropout(p=drop_rate) + + if 
use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads) + else: + self.rel_pos_bias = None + + dpr = np.linspace(0, drop_path_rate, depth, dtype=np.float32) + self.use_rel_pos_bias = use_rel_pos_bias + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None) + for i in range(depth)]) + self.norm = Identity() if use_mean_pooling else eval(norm_layer)(embed_dim, epsilon=1e-6) + self.fc_norm = eval(norm_layer)(embed_dim, epsilon=1e-6) if use_mean_pooling else None + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else Identity() + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed) + trunc_normal_(self.cls_token) + # trunc_normal_(self.mask_token) + if isinstance(self.head, nn.Linear): + trunc_normal_(self.head.weight) + self.apply(self._init_weights) + self.fix_init_weight() + + if isinstance(self.head, nn.Linear): + x = self.head.weight.multiply(paddle.to_tensor([init_values])) + self.head.weight = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + x = self.head.bias.multiply(paddle.to_tensor([init_values])) + self.head.bias = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + def fix_init_weight(self): + def rescale(param, layer_id): + x = param.divide(paddle.to_tensor([math.sqrt(2.0 * layer_id)])) + param = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight, layer_id + 1) + rescale(layer.mlp.fc2.weight, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def get_num_layers(self): + return len(self.blocks) + + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else Identity() + + def interpolate_pos_encoding(self, x, w, h): + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + class_pos_embed = self.pos_embed[:, 0] + patch_pos_embed = self.pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_embed.patch_size[0] + h0 = h // self.patch_embed.patch_size[0] + + w0, h0 = w0 + 0.1, h0 + 0.1 + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).transpose([0, 3, 1, 2]), + scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), + mode='bicubic', + ) + assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.transpose([0, 2, 3, 1]).reshape(1, -1, dim) + return paddle.concat([class_pos_embed.unsqueeze(0), patch_pos_embed], dim=1) + + def forward_features(self, x, return_patch_tokens=False, return_all_tokens=False, 
**kwargs): + B, nc, w, h = x.shape + x = self.patch_embed(x) + batch_size, seq_len, _ = x.shape + cls_tokens = self.cls_token.expand((B, -1, -1)) + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + if x.shape[1] != self.pos_embed.shape[1]: + x = x + self.interpolate_pos_encoding(x, w, h) + else: + x = x + self.pos_embed + + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for i, blk in enumerate(self.blocks): + x = blk(x, rel_pos_bias=rel_pos_bias) + + x = self.norm(x) + if self.fc_norm is not None: + if return_all_tokens: + return self.fc_norm(x) + t = x[:, 1:, :] + if return_patch_tokens: + return self.fc_norm(t) + else: + return self.fc_norm(t.mean(1)) + else: + if return_all_tokens: + return x + elif return_patch_tokens: + return x[:, 1:] + else: + return x[:, 0] + + def forward(self, x, return_patch_tokens=False, return_all_tokens=False, **kwargs): + x = self.forward_features(x, return_patch_tokens=return_patch_tokens, return_all_tokens=return_all_tokens, **kwargs) + x = self.head(x) + return x + + def forward_intermediate(self, x, layer_id=12, norm_output=False): + x = self.patch_embed(x) + batch_size, seq_len, _ = x.size() + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = paddle.concat([cls_tokens, x], axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + if isinstance(layer_id, list): + output_list = [] + for l, blk in enumerate(self.blocks): + x = blk(x, rel_pos_bias=rel_pos_bias) + # use last norm for all intermediate layers + if l in layer_id: + if norm_output: + x_norm = self.fc_norm(self.norm(x[:, 1:])) + output_list.append(x_norm) + else: + output_list.append(x[:, 1:]) + return output_list + elif isinstance(layer_id, int): + for l, blk in enumerate(self.blocks): + if l < layer_id: + x = blk(x, rel_pos_bias=rel_pos_bias) + elif l == layer_id: + x = blk.norm1(x) + else: + break + return x[:, 1:] + else: + raise NotImplementedError(f"Not support for layer id is {layer_id} now!") + + def get_intermediate_layers(self, x, use_last_norm=False): + x = self.patch_embed(x) + batch_size, seq_len, _ = x.size() + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = paddle.concat([cls_tokens, x], axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + features = [] + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for blk in self.blocks: + x = blk(x, rel_pos_bias) + if use_last_norm: + features.append(self.norm(x)) + else: + features.append(x) + + return features + + + +def beit_base_patch16_224(pretrained=False, finetune_weight=None, model_filter_name='', **kwargs): + model = VisionTransformer( + patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, # qkv_bias=True, + norm_layer="nn.LayerNorm", **kwargs) + if finetune_weight is not None: + checkpoint = paddle.load(finetune_weight) + + print("Load ckpt from %s" % finetune_weight) + checkpoint_model = checkpoint + if (checkpoint_model is not None) and (model_filter_name != ''): + all_keys = list(checkpoint_model.keys()) + new_dict = OrderedDict() + for key in all_keys: + if key.startswith('encoder.'): + new_dict[key[8:]] = checkpoint_model[key] + else: + pass + checkpoint_model = new_dict + state_dict = model.state_dict() + for k in 
['head.weight', 'head.bias']: + if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape: + print(f"Removing key {k} from pretrained checkpoint") + del checkpoint_model[k] + + if getattr(model, 'use_rel_pos_bias', False) and "rel_pos_bias.relative_position_bias_table" in checkpoint_model: + print("Expand the shared relative position embedding to each transformer block. ") + num_layers = model.get_num_layers() + rel_pos_bias = checkpoint_model["rel_pos_bias.relative_position_bias_table"] + for i in range(num_layers): + checkpoint_model["blocks.%d.attn.relative_position_bias_table" % i] = rel_pos_bias.clone() + + checkpoint_model.pop("rel_pos_bias.relative_position_bias_table") + + all_keys = list(checkpoint_model.keys()) + for key in all_keys: + if "relative_position_index" in key: + checkpoint_model.pop(key) + + if "relative_position_bias_table" in key: + rel_pos_bias = checkpoint_model[key] + src_num_pos, num_attn_heads = rel_pos_bias.shape + dst_num_pos, _ = model.state_dict()[key].shape + dst_patch_shape = model.patch_embed.patch_shape + if dst_patch_shape[0] != dst_patch_shape[1]: + raise NotImplementedError() + num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * (dst_patch_shape[1] * 2 - 1) + src_size = int((src_num_pos - num_extra_tokens) ** 0.5) + dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5) + + if src_size != dst_size: + print("Position interpolate for %s from %dx%d to %dx%d" % ( + key, src_size, src_size, dst_size, dst_size)) + extra_tokens = rel_pos_bias[-num_extra_tokens:, :] + rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] + + def geometric_progression(a, r, n): + return a * (1.0 - r ** n) / (1.0 - r) + + left, right = 1.01, 1.5 + while right - left > 1e-6: + q = (left + right) / 2.0 + gp = geometric_progression(1, q, src_size // 2) + if gp > dst_size // 2: + right = q + else: + left = q + + # if q > 1.090307: + # q = 1.090307 + + dis = [] + cur = 1 + for i in range(src_size // 2): + dis.append(cur) + cur += q ** (i + 1) + + r_ids = [-_ for _ in reversed(dis)] + + x = r_ids + [0] + dis + y = r_ids + [0] + dis + + t = dst_size // 2.0 + dx = np.arange(-t, t + 0.1, 1.0) + dy = np.arange(-t, t + 0.1, 1.0) + + print("Original positions = %s" % str(x)) + print("Target positions = %s" % str(dx)) + + all_rel_pos_bias = [] + + for i in range(num_attn_heads): + z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy() + f = interpolate.interp2d(x, y, z, kind='cubic') + all_rel_pos_bias.append( + paddle.to_tensor(f(dx, dy), place=rel_pos_bias.place).reshape([-1, 1])) + + rel_pos_bias = paddle.concat(all_rel_pos_bias, axis=-1) + + new_rel_pos_bias = paddle.concat([rel_pos_bias, extra_tokens], axis=0) + checkpoint_model[key] = new_rel_pos_bias + # interpolate position embedding + if ('pos_embed' in checkpoint_model) and (model.pos_embed is not None): + pos_embed_checkpoint = checkpoint_model['pos_embed'] + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = model.patch_embed.num_patches + num_extra_tokens = model.pos_embed.shape[-2] - num_patches + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches ** 0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position 
tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) + pos_tokens = paddle.nn.functional.interpolate( + pos_tokens, size=[new_size, new_size], mode='bicubic', align_corners=False) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = paddle.concat([extra_tokens, pos_tokens], axis=1) + checkpoint_model['pos_embed'] = new_pos_embed + model.set_dict(checkpoint_model) + model.default_cfg = _cfg() + return model \ No newline at end of file diff --git a/ppcls/arch/backbone/model_zoo/modeling_pretrain.py b/ppcls/arch/backbone/model_zoo/modeling_pretrain.py new file mode 100644 index 0000000000..b4153bcba4 --- /dev/null +++ b/ppcls/arch/backbone/model_zoo/modeling_pretrain.py @@ -0,0 +1,364 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was heavily based on https://github.com/facebookresearch/deit +# reference: https://arxiv.org/abs/2012.12877 + +import math +import numpy as np +import paddle +import paddle.nn as nn + +from paddle.nn.initializer import TruncatedNormal, Constant, Normal +from .modeling_finetune import Block, PatchEmbed, RelativePositionBias, _cfg, zeros_, ones_, Identity +trunc_normal_ = TruncatedNormal(std=.02) + + +class VisionTransformerForMaskedImageModeling(nn.Layer): + def __init__(self, img_size=224, patch_size=16, in_chans=3, vocab_size=8192, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=None, init_values=None, attn_head_dim=None, + use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, init_std=0.02): + super().__init__() + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + self.num_heads = num_heads + + self.cls_token = self.create_parameter(shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + self.mask_token = self.create_parameter(shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("mask_token", self.mask_token) + if use_abs_pos_emb: + self.pos_embed = self.create_parameter(shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_) + self.add_parameter("pos_embed", self.pos_embed) + else: + self.pos_embed = None + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads) + else: + self.rel_pos_bias = None + + dpr = np.linspace(0, drop_path_rate, depth, dtype=np.float32) + self.use_rel_pos_bias = use_rel_pos_bias + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + 
qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, + attn_head_dim=attn_head_dim) + for i in range(depth) + ]) + self.norm = eval(norm_layer)(embed_dim, epsilon=1e-6) + + self.init_std = init_std + self.lm_head = nn.Linear(embed_dim, vocab_size) + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed) + trunc_normal_(self.cls_token) + trunc_normal_(self.mask_token) + trunc_normal_(self.lm_head.weight) + self.apply(self._init_weights) + self.fix_init_weight() + + def fix_init_weight(self): + def rescale(param, layer_id): + x = param.divide(paddle.to_tensor([math.sqrt(2.0 * layer_id)])) + param = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight, layer_id + 1) + rescale(layer.mlp.fc2.weight, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def get_num_layers(self): + return len(self.blocks) + + def forward_features(self, x, bool_masked_pos): + x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks + mask_token = self.mask_token.expand((batch_size, seq_len, -1)) + + # replace the masked visual tokens by mask_token + w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) + x = x * (1 - w) + mask_token * w + + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for i, blk in enumerate(self.blocks): + x = blk(x, rel_pos_bias=rel_pos_bias) + + x = self.norm(x) + return x + + + def forward(self, x, bool_masked_pos=None, return_all_tokens=False, return_patch_tokens=False): + if bool_masked_pos is None: + bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) + x = self.forward_features(x, bool_masked_pos=bool_masked_pos) + x = x[:, 1:] + if return_patch_tokens: + return x + if return_all_tokens: + return self.lm_head(x) + else: + # return the masked tokens + return self.lm_head(x[bool_masked_pos]) + + def forward_return_qkv(self, x, bool_masked_pos=None, split_out_as_qkv=False): + if bool_masked_pos is None: + bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) + x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks + mask_token = self.mask_token.expand((batch_size, seq_len, -1)) + + # replace the masked visual tokens by mask_token + w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) + x = x * (1 - w) + mask_token * w + + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x 
= self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for i, blk in enumerate(self.blocks): + if i < len(self.blocks) - 1: + x = blk(x, rel_pos_bias=rel_pos_bias) + else: + x, qkv = blk(x, rel_pos_bias=rel_pos_bias, return_qkv=True) + + if split_out_as_qkv: + x = self.norm(x) + x = self.lm_head(x) + q, k, v = x.chunks(3, axis=-1) + b, n, c = q.shape + q = q.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) + k = k.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) + v = v.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) + return x, q, k, v + else: + x = self.norm(x) + x = x[:, 1:] + x = self.lm_head(x[bool_masked_pos]) + + q, k, v = qkv[0], qkv[1], qkv[2] + + return x, q, k, v + + def forward_intermediate(self, x, bool_masked_pos=None, layer_id=12): + if bool_masked_pos is None: + bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) + x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks + mask_token = self.mask_token.expand((batch_size, seq_len, -1)) + + # replace the masked visual tokens by mask_token + w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) + x = x * (1 - w) + mask_token * w + + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + if isinstance(layer_id, list): + output_list = [] + for l, blk in enumerate(self.blocks): + x = blk(x, rel_pos_bias=rel_pos_bias) + if l in layer_id: + output_list.append(x[:, 1:]) + return output_list + elif isinstance(layer_id, int): + for l, blk in enumerate(self.blocks): + if l < layer_id: + x = blk(x, rel_pos_bias=rel_pos_bias) + elif l == layer_id: + x = blk.norm1(x) + else: + break + return x[:, 1:] + else: + raise NotImplementedError(f"Not support for layer id is {layer_id} now!") + + def get_last_selfattention(self, x): + x = self.patch_embed(x) + batch_size, seq_len, _ = x.shape + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.patch_embed + x = self.pos_drop(x) + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + + for i, blk in enumerate(self.blocks): + if i < len(self.blocks) - 1: + x = blk(x, rel_pos_bias=rel_pos_bias) + else: + # return attention of the last block + return blk(x, rel_pos_bias=rel_pos_bias, return_attention=True) + + +class VisionTransformerForMaskedImageModelingCLS(VisionTransformerForMaskedImageModeling): + def __init__(self, img_size=224, patch_size=16, in_chans=3, vocab_size=8192, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=None, init_values=None, attn_head_dim=None, + use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, init_std=0.02, + early_layers=6, head_layers=2, shared_lm_head=True): + super().__init__(img_size=img_size, patch_size=patch_size, in_chans=in_chans, vocab_size=vocab_size, embed_dim=embed_dim, depth=depth, + num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rate, norm_layer=norm_layer, 
init_values=init_values, attn_head_dim=attn_head_dim, + use_abs_pos_emb=use_abs_pos_emb, use_rel_pos_bias=use_rel_pos_bias, use_shared_rel_pos_bias=use_shared_rel_pos_bias, init_std=init_std) + + self.early_layers = early_layers + print(f'early layer {early_layers}, late layer {depth - early_layers}, condenser head layers {head_layers}, shared_lm_head {shared_lm_head}') + + dpr = np.linspace(0, drop_path_rate, max(depth, early_layers + head_layers), dtype=np.float32) + self.cls_pt_layers = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, + attn_head_dim=attn_head_dim) + for i in range(early_layers, early_layers + head_layers) + ]) + self.fix_init_cls_pt_weight() + + self.shared_lm_head = shared_lm_head + if not self.shared_lm_head: + self.cls_pt_norm = norm_layer(embed_dim) + self.cls_pt_lm_head = nn.Linear(embed_dim, vocab_size) + + self.cls_pt_norm.apply(self._init_weights) + self.cls_pt_lm_head.apply(self._init_weights) + + def fix_init_cls_pt_weight(self): + def rescale(param, layer_id): + x = param.divide(paddle.to_tensor([math.sqrt(2.0 * layer_id)])) + param = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight, self.early_layers + layer_id + 1) + rescale(layer.mlp.fc2.weight, self.early_layers + layer_id + 1) + + def forward_features(self, x, bool_masked_pos): + x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks + mask_token = self.mask_token.expand((batch_size, seq_len, -1)) + + # replace the masked visual tokens by mask_token + w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) + x = x * (1 - w) + mask_token * w + + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for i, blk in enumerate(self.blocks): + x = blk(x, rel_pos_bias=rel_pos_bias) + if i + 1 == self.early_layers: + early_states = x[:, 1:] + + x_cls_pt = paddle.concat((x[:, 0].unsqueeze(1), early_states), axis=1) + for blk in self.cls_pt_layers: + x_cls_pt = blk(x_cls_pt, rel_pos_bias=rel_pos_bias) + + return self.norm(x), self.norm(x_cls_pt) if self.shared_lm_head else self.cls_pt_norm(x_cls_pt) + + def forward(self, x, bool_masked_pos=None, return_all_tokens=False, return_patch_tokens=False): + if bool_masked_pos is None: + bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) + x, x_cls_pt = self.forward_features(x, bool_masked_pos=bool_masked_pos) + x = x[:, 1:] + x_cls_pt = x_cls_pt[:, 1:] + if return_patch_tokens: + return [x, x_cls_pt] + if return_all_tokens: + return [self.lm_head(x), self.lm_head(x_cls_pt) if self.shared_lm_head else self.cls_pt_lm_head(x_cls_pt)] + else: + # return the masked tokens + return [self.lm_head(x[bool_masked_pos]), self.lm_head(x_cls_pt[bool_masked_pos]) if self.shared_lm_head else self.cls_pt_lm_head(x_cls_pt[bool_masked_pos])] + + + +def beit_base_patch16_224_8k_vocab_cls_pt(pretrained=False, 
pretrained_weight=None, **kwargs): + if "num_classes" in kwargs: + _ = kwargs.pop("num_classes") + if 'vocab_size' in kwargs: + vocab_size = kwargs['vocab_size'] + _ = kwargs.pop("vocab_size") + else: + vocab_size = 8192 + model = VisionTransformerForMaskedImageModelingCLS( + patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, + norm_layer="nn.LayerNorm", vocab_size=vocab_size, **kwargs) + weight = paddle.load(pretrained_weight) + model.set_dict(weight) + model.default_cfg = _cfg() + return model \ No newline at end of file diff --git a/ppcls/arch/backbone/model_zoo/norm_ema_quantizer.py b/ppcls/arch/backbone/model_zoo/norm_ema_quantizer.py new file mode 100644 index 0000000000..52fe6ac11b --- /dev/null +++ b/ppcls/arch/backbone/model_zoo/norm_ema_quantizer.py @@ -0,0 +1,242 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as distributed +from einops import rearrange, repeat + +from .modeling_finetune import zeros_, ones_, Identity + +def l2norm(t): + return F.normalize(t, p=2, axis=-1) + +def ema_inplace(moving_avg, new, decay): + x = moving_avg * decay + x = x + new*(1-decay) + moving_avg = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + +def sample_vectors(samples, num): + num_samples, device = samples.shape[0], samples.device + + if num_samples >= num: + indices = paddle.randperm(num_samples)[:num] + else: + indices = paddle.randint(0, num_samples, [num,]) + + return samples[indices] + +def kmeans(samples, num_clusters, num_iters = 10, use_cosine_sim = False): + dim, dtype, device = samples.shape[-1], samples.dtype, samples.device + + means = sample_vectors(samples, num_clusters) + + for _ in range(num_iters): + if use_cosine_sim: + dists = samples @ means.t() + else: + diffs = rearrange(samples, 'n d -> n () d') \ + - rearrange(means, 'c d -> () c d') + dists = -(diffs ** 2).sum(axis = -1) + + buckets = dists.max(axis = -1).indices + bins = paddle.bincount(buckets, minlength = num_clusters) + zero_mask = bins == 0 + bins_min_clamped = bins.masked_fill(zero_mask, 1) + + new_means = buckets.new_zeros(num_clusters, dim, dtype = dtype) + new_means.scatter_add_(0, repeat(buckets, 'n -> n d', d = dim), samples) + new_means = new_means / bins_min_clamped[..., None] + + if use_cosine_sim: + new_means = l2norm(new_means) + + means = paddle.where(zero_mask[..., None], means, new_means) + + return means, bins + + +class EmbeddingEMA(nn.Layer): + def __init__(self, num_tokens, codebook_dim, decay=0.99, eps=1e-5, kmeans_init=True, codebook_init_path=''): + super().__init__() + self.num_tokens = num_tokens + self.codebook_dim = codebook_dim + self.decay = decay + self.eps = eps + if codebook_init_path == '': + if not kmeans_init: + weight = paddle.randn([num_tokens, codebook_dim]) + weight = l2norm(weight) + else: + weight = paddle.zeros([num_tokens, codebook_dim]) + self.register_buffer('initted', paddle.to_tensor([not kmeans_init], dtype='float32')) + else: + print(f"load init codebook weight from {codebook_init_path}") + codebook_ckpt_weight = paddle.load(codebook_init_path, map_location='cpu') + weight = codebook_ckpt_weight.clone() + self.register_buffer('initted', paddle.to_tensor([True])) + + self.weight = paddle.create_parameter(shape=weight.shape, + dtype=str(weight.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(weight)) + self.cluster_size = self.create_parameter(shape=[num_tokens], default_initializer=zeros_) 
+ self.add_parameter("cluster_size", self.cluster_size) + self.embed_avg = paddle.create_parameter(shape=weight.shape, + dtype=str(weight.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(weight)) + self.update = True + + def init_embed_(self, data): + if self.initted: + return + print("Performing Kemans init for codebook") + embed, cluster_size = kmeans(data, self.num_tokens, 10, use_cosine_sim=True) + self.weight = paddle.create_parameter(shape=embed.shape, + dtype=str(embed.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(embed)) + self.cluster_size = paddle.create_parameter(shape=cluster_size.shape, + dtype=str(cluster_size.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(cluster_size)) + self.initted = paddle.create_parameter(shape=[1], + dtype="bool", + default_initializer=paddle.nn.initializer.Assign(paddle.to_tensor([True]))) + + def forward(self, embed_id): + return F.embedding(embed_id, self.weight) + + def cluster_size_ema_update(self, new_cluster_size): + x = self.cluster_size.multiply(self.decay) + x = x.add(new_cluster_size*(1 - self.decay)) + self.cluster_size = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + def embed_avg_ema_update(self, new_embed_avg): + x = self.cluster_size.multiply(self.decay) + x = x.add(new_embed_avg*(1 - self.decay)) + self.embed_avg = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + def weight_update(self, num_tokens): + n = self.cluster_size.sum() + smoothed_cluster_size = ( + (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n + ) + #normalize embedding average with smoothed cluster size + embed_normalized = self.embed_avg / smoothed_cluster_size.unsqueeze(1) + # embed_normalized = l2norm(self.embed_avg / smoothed_cluster_size.unsqueeze(1)) + self.weight = paddle.create_parameter(shape=embed_normalized.shape, + dtype=str(embed_normalized.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(embed_normalized)) + + +def norm_ema_inplace(moving_avg, new, decay): + x = moving_avg.multiply(paddle.to_tensor(decay)) + x = x.add(new*(1 - decay)) + x = l2norm(x) + moving_avg = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + + +class NormEMAVectorQuantizer(nn.Layer): + def __init__(self, n_embed, embedding_dim, beta, decay=0.99, eps=1e-5, + statistic_code_usage=True, kmeans_init=False, codebook_init_path=''): + super().__init__() + self.codebook_dim = embedding_dim + self.num_tokens = n_embed + self.beta = beta + self.decay = decay + + # learnable = True if orthogonal_reg_weight > 0 else False + self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay, eps, kmeans_init, codebook_init_path) + + self.statistic_code_usage = statistic_code_usage + if statistic_code_usage: + self.register_buffer('cluster_size', paddle.zeros([n_embed])) + # if distributed.is_available() and distributed.is_initialized(): + # print("ddp is enable, so use ddp_reduce to sync the statistic_code_usage for each gpu!") + # self.all_reduce_fn = distributed.all_reduce + # else: + # self.all_reduce_fn = Identity + # self.all_reduce_fn = paddle.distributed.all_reduce() + + def reset_cluster_size(self, device): + if self.statistic_code_usage: + self.register_buffer('cluster_size', paddle.zeros([self.num_tokens])) + self.cluster_size = self.cluster_size.to(device) + 
+ def _masked_fill(self, x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + def forward(self, z): + # reshape z -> (batch, height, width, channel) and flatten + #z, 'b c h w -> b h w c' + b, c, h, w = z.shape + z = paddle.reshape(z, [b, h, w, c]) + # z = rearrange(z, 'b c h w -> b h w c') + z = l2norm(z) + z_flattened = z.reshape([-1, self.codebook_dim]) + + self.embedding.init_embed_(z_flattened) + + d = z_flattened.pow(2).sum(axis=1, keepdim=True) + \ + self.embedding.weight.pow(2).sum(axis=1) - 2 * \ + paddle.einsum('bd,nd->bn', z_flattened, self.embedding.weight) # 'n d -> d n' + + encoding_indices = paddle.argmin(d, axis=1) + + z_q = self.embedding(encoding_indices).reshape(z.shape) + + encodings = F.one_hot(encoding_indices, self.num_tokens).astype(z.dtype) + + if not self.training: + with paddle.no_grad(): + cluster_size = encodings.sum(0) + # self.all_reduce_fn(cluster_size) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(cluster_size) + ema_inplace(self.cluster_size, cluster_size, self.decay) + + if self.training and self.embedding.update: + # EMA cluster size + + bins = encodings.sum(0) + # self.all_reduce_fn(bins) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(bins) + + # self.embedding.cluster_size_ema_update(bins) + ema_inplace(self.cluster_size, bins, self.decay) + + zero_mask = (bins == 0) + # bins = bins.masked_fill(zero_mask, 1.) + bins = self._masked_fill(bins, zero_mask, 1.) + + embed_sum = z_flattened.t() @ encodings + # self.all_reduce_fn(embed_sum) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(embed_sum) + + embed_normalized = (embed_sum / bins.unsqueeze(0)).t() + embed_normalized = l2norm(embed_normalized) + + embed_normalized = paddle.where(zero_mask[..., None], self.embedding.weight, + embed_normalized) + norm_ema_inplace(self.embedding.weight, embed_normalized, self.decay) + + # compute loss for embedding + loss = self.beta * F.mse_loss(z_q.detach(), z) + + # preserve gradients + z_q = z + (z_q - z).detach() + + # reshape back to match original input shape + #z_q, 'b h w c -> b c h w' + b, h, w, c = z_q.shape + z_q = paddle.reshape(z_q, [b, c, h, w]) + # z_q = rearrange(z_q, 'b h w c -> b c h w') + return z_q, loss, encoding_indices diff --git a/ppcls/arch/backbone/model_zoo/vqkd.py b/ppcls/arch/backbone/model_zoo/vqkd.py new file mode 100644 index 0000000000..08140d4bb4 --- /dev/null +++ b/ppcls/arch/backbone/model_zoo/vqkd.py @@ -0,0 +1,308 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Code was heavily based on https://github.com/facebookresearch/deit +# reference: https://arxiv.org/abs/2012.12877 + +import math +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal + +from .modeling_finetune import VisionTransformer, zeros_, ones_ +from .norm_ema_quantizer import NormEMAVectorQuantizer + +MODEL_URLS = { + "vqkd": + "vqkd.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +trunc_normal_ = TruncatedNormal(std=.02) +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) +IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) + +class VQKD(nn.Layer): + def __init__(self, + encoder_config, + decoder_config, + n_embed=8192, + embed_dim=32, + decay=0.99, + process_type='default', + quantize_kmeans_init=True, + teacher_model_type='clip', + decoder_out_dim=512, + rec_loss_type='cosine', + **kwargs + ): + super().__init__() + print(kwargs) + if decoder_config['in_chans'] != embed_dim: + print(f"Rewrite the in_chans in decoder from {decoder_config['in_chans']} to {embed_dim}") + decoder_config['in_chans'] = embed_dim + + # encoder & decode params + print('Final encoder config', encoder_config) + self.encoder = VisionTransformer(**encoder_config) + + print('Final decoder config', decoder_config) + self.decoder = VisionTransformer(**decoder_config) + + self.quantize = NormEMAVectorQuantizer( + n_embed=n_embed, embedding_dim=embed_dim, beta=1.0, kmeans_init=quantize_kmeans_init, decay=decay, + ) + + self.patch_size = encoder_config['patch_size'] + self.token_shape = (encoder_config['img_size'] // self.patch_size, encoder_config['img_size'] // self.patch_size) + + ## Teacher model setting + self.teacher_model_type = teacher_model_type + self.decoder_out_dim = decoder_out_dim + self.teacher_model = None + # if self.teacher_model_type == 'clip': + # self.scaling_layer = ScalingLayerForClip() + # self.teacher_model, _ = clip.load("ViT-B/16", device='cpu', jit=False) + # self.decoder_out_dim = 512 + + # elif self.teacher_model_type == 'dino': + # self.scaling_layer = ScalingLayerForIM() + # self.teacher_model = get_dino_vit_base() + # self.decoder_out_dim = 768 + + # else: + # self.teacher_model = None + + # if self.teacher_model is not None: + # for param in self.teacher_model.parameters(): + # param.requires_grad = False # fix teacher_model model + + # self.teacher_model.eval() + # self.teacher_input_size = kwargs.get('teacher_input_size', 224) + + # task layer + self.encode_task_layer = nn.Sequential( + nn.Linear(encoder_config['embed_dim'], encoder_config['embed_dim']), + nn.Tanh(), + nn.Linear(encoder_config['embed_dim'], embed_dim) # for quantize + ) + self.decode_task_layer = nn.Sequential( + nn.Linear(decoder_config['embed_dim'], decoder_config['embed_dim']), + nn.Tanh(), + nn.Linear(decoder_config['embed_dim'], self.decoder_out_dim), + ) + + self.rec_loss_type = rec_loss_type + + print(f"process type for VQKD: {process_type}") + self.process_type = process_type # in ['default', 'dall-e'] + self.logit_laplace_eps = 0.1 + self.kwargs = kwargs + + self.encode_task_layer.apply(self._init_weights) + self.decode_task_layer.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def no_weight_decay(self): + return {'quantize.embedding.weight', 'decoder.cls_token', 'decoder.pos_embed', + 'encoder.cls_token', 'encoder.pos_embed'} + + def device(self): + return 
self.decoder.cls_token.device + + def pre_process(self, data): + if self.process_type == 'default': + # TODO: modify for adapt + if data.max() <= 1.: + data = data * 255. + data = data / 127.5 - 1.0 + elif self.process_type == 'imagenet_norm': + mean = paddle.to_tensor(IMAGENET_DEFAULT_MEAN).set_device(self.device)[None, :, None, None] + std = paddle.to_tensor(IMAGENET_DEFAULT_STD).set_device(self.device)[None, :, None, None] + data = (data - mean) / std + return data + + def get_number_of_tokens(self): + return self.quantize.n_e + + def get_tokens(self, data, **kwargs): + data = self.pre_process(data) + quantize, embed_ind, loss = self.encode(data) + output = {} + output['token'] = embed_ind.reshape([data.shape[0], -1]) + output['input_img'] = data + + return output + + def encode(self, x): + encoder_features = self.encoder(x, return_patch_tokens=True) + + with paddle.amp.auto_cast(enable=False): + to_quantizer_features = self.encode_task_layer(encoder_features.astype(self.encode_task_layer[-1].weight.dtype)) + # to_quantizer_features = self.encode_task_layer(encoder_features.astype(self.encode_task_layer[-1].weight.dtype)) + + N = to_quantizer_features.shape[1] + h, w = int(math.sqrt(N)), int(math.sqrt(N)) + + b, c = to_quantizer_features.shape[0], to_quantizer_features.shape[-1] + to_quantizer_features = paddle.reshape(to_quantizer_features, [b, c, h, w]) + # to_quantizer_features = rearrange(to_quantizer_features, 'b (h w) c -> b c h w', h=h, w=w) # reshape for quantizer + quantize, loss, embed_ind = self.quantize(to_quantizer_features) + + return quantize, embed_ind, loss + + def decode(self, quantize, **kwargs): + # reshape tokens to feature maps for patch embed in decoder + # quantize = rearrange(quantize, 'b (h w) c -> b c h w', h=self.token_shape[0], w=self.token_shape[1]) + decoder_features = self.decoder(quantize, return_patch_tokens=True) + rec = self.decode_task_layer(decoder_features) + + return rec + + def get_codebook_indices(self, x, **kwargs): + # for beit pre-training + return self.get_tokens(x, **kwargs)['token'] + + def get_regress_target(self, x, **kwargs): + + norm_imgs = self.scaling_layer(x) + if self.teacher_model_type == 'clip': + target = self.teacher_model.encode_image(norm_imgs, return_all_tokens=True) @ self.teacher_model.visual.proj + elif self.teacher_model_type == 'dino': + target = self.teacher_model.forward(norm_imgs, return_patch_tokens=True) + else: + raise NotImplementedError + + return target + + def calculate_rec_loss(self, rec, target): + if self.rec_loss_type == 'cosine': + target = target / target.norm(dim=-1, keepdim=True) + rec = rec / rec.norm(dim=-1, keepdim=True) + rec_loss = (1 - (target * rec).sum(-1)).mean() + else: + raise NotImplementedError + + return rec_loss + + def forward(self, x, **kwargs): + """ + x: shape [B, 3, H, W] in [0, 1] + """ + x = self.pre_process(x) # rescale to [-1, 1] + + target = self.get_regress_target(x, **kwargs) + + quantize, embed_ind, emb_loss = self.encode(x) + xrec = self.decode(quantize) + + rec_loss = self.calculate_rec_loss(xrec, target) + loss = emb_loss + rec_loss + + log = {} + split="train" if self.training else "val" + log[f'{split}/quant_loss'] = emb_loss.detach().mean() + log[f'{split}/rec_loss'] = rec_loss.detach().mean() + log[f'{split}/total_loss'] = loss.detach().mean() + + return loss, log + + +class ScalingLayerForClip(nn.Layer): + def __init__(self): + super(ScalingLayerForClip, self).__init__() + self.register_buffer('shift', paddle.to_tensor([0.48145466, 0.4578275, 0.40821073])[None, :, 
None, None]) + self.register_buffer('scale', paddle.to_tensor([0.26862954, 0.26130258, 0.27577711])[None, :, None, None]) + + def forward(self, inp): + inp = ((inp + 1.) * 127.5).clamp(0, 255.) / 255. # rescale to [0, 1.] + return (inp - self.shift) / self.scale + +class ScalingLayerForIM(nn.Layer): + def __init__(self): + super(ScalingLayerForIM, self).__init__() + self.register_buffer('shift', paddle.to_tensor([0.485, 0.456, 0.406])[None, :, None, None]) # scale for tokenizer with default prosscess type \in [-1, 1] + self.register_buffer('scale', paddle.to_tensor([0.229, 0.224, 0.225])[None, :, None, None]) + + def forward(self, inp): + inp = ((inp + 1.) * 127.5).clamp(0, 255.) / 255. # rescale to [0, 1.] + return (inp - self.shift) / self.scale + + +def get_model_default_params(): + return dict(img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12, num_heads=12, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., + norm_layer="nn.LayerNorm", init_values=0., use_abs_pos_emb=True, + use_rel_pos_bias=False, use_shared_rel_pos_bias=False, use_mean_pooling=True, init_scale=0.001) + + +def vqkd_encoder_base_decoder_3x768x12_clip(pretrained=False, pretrained_weight=None, as_tokenzer=False, img_size=224, + n_code=8192, code_dim=32, **kwargs): + encoder_config, decoder_config = get_model_default_params(), get_model_default_params() + + # encoder settings + encoder_config['img_size'] = img_size + encoder_config['num_classes'] = 0 + # decoder settings + decoder_config['img_size'] = img_size // decoder_config['patch_size'] + decoder_config['patch_size'] = 1 + decoder_config['in_chans'] = code_dim + decoder_config['num_classes'] = 0 + decoder_config['depth'] = 3 + # teacher settings + _ = kwargs.pop("teacher_model_type", "clip") + + # teacher_model_type = 'clip' if not as_tokenzer else 'None' + teacher_model_type = 'None' + decoder_out_dim = 512 + + model = VQKD(encoder_config, + decoder_config, + n_code, + code_dim, + teacher_model_type=teacher_model_type, + decoder_out_dim=decoder_out_dim, + **kwargs) + + # if as_tokenzer: + # assert pretrained + # assert pretrained_weight is not None + + # if pretrained_weight.startswith('https'): + # weights = torch.hub.load_state_dict_from_url(pretrained_weight, map_location='cpu', check_hash=True) + # else: + # weights = torch.load(pretrained_weight, map_location='cpu') + + # if 'model' in weights: + # weights = weights['model'] + # else: + # weights = weights["state_dict"] + # keys = list(weights.keys()) + + # for k in keys: + # if k.startswith("loss") or k.startswith("teacher") or k.startswith("scaling"): + # del weights[k] + # model.load_state_dict(weights) + weight = paddle.load(pretrained_weight) + model.set_dict(weight) + return model \ No newline at end of file diff --git a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_ft.yaml b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_ft.yaml new file mode 100644 index 0000000000..2b72bdf7d5 --- /dev/null +++ b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_ft.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 50 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + to_static: False + seed: 0 + distributed: 4 + +# model architecture +Arch: + name: 
beit_base_patch16_224 + pretrained: True + finetune_weight: ./dataset/beitv2_base_patch16_224_pt1k.pdparams + model_filter_name: '' + num_classes: 1000 + drop_rate : 0.0 + drop_path_rate : 0.1 + attn_drop_rate: 0.0 + use_mean_pooling: True + init_scale: 0.001 + use_rel_pos_bias: True + use_abs_pos_emb: False + init_values: 0.1 + qkv_bias: True + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed .bias gamma + one_dim_param_no_weight_decay: True + layer_decay: 0.65 + lr: + # for 8 cards + name: Cosine + learning_rate: 25e-5 + eta_min: 5e-7 + warmup_epoch: 20 + warmup_start_lr: 0 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandomResizedCropAndInterpolation: + size: 224 + interpolation: bicubic + - RandomHorizontalFlip: + prob: 0.5 + - RawTimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - MixupCutmixHybrid: + mixup_alpha: 0.8 + cutmix_alpha: 1.0 + num_classes: 1000 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: ./dataset/ILSVRC2012/val/ILSVRC2012_val_00040137.JPEG + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + - CenterCrop: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToTensor: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml new file mode 100644 index 0000000000..eacaeea976 --- /dev/null +++ b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml @@ -0,0 +1,165 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/beitv2_clip_dist + device: gpu + save_interval: 1 + eval_during_train: False + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + use_multilabel: True + save_inference_dir: 
./inference + to_static: False + seed: 0 + distributed: 4 + +AMP: + scale_loss: 65536.0 + use_dynamic_loss_scaling: True + incr_every_n_steps: 2000 + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: "Beitv2Model" + drop_path_rate : 0.1 + class_num: &class_num 1000 + is_beitv2: True + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + infer_model_name: "Student" + models: + - Teacher: + name: vqkd_encoder_base_decoder_3x768x12_clip + pretrained: True + pretrained_weight: /home/aistudio/weight/vqkd.pdparams + as_tokenzer: False + img_size: 224 + n_code: 8192 + code_dim: 32 + - Student: + name: beit_base_patch16_224_8k_vocab_cls_pt + pretrained: True + pretrained_weight: /home/aistudio/weight/pretrain_model.pdparams + drop_path_rate: 0.1 + use_shared_rel_pos_bias: True + use_abs_pos_emb: False + init_values: 0.1 + early_layers: 9 + head_layers: 2 + shared_lm_head: True + +# loss function config for traing/eval process +Loss: + Train: + - DistillationBeitV2CELoss: + weight: 1.0 + model_name_pairs: + - ["Teacher", "Student"] + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.98 + momentum: 0.9 + weight_decay: 1e-4 + epsilon: 1e-8 + #multi precision: True + no_weight_decay_name: pos_embed cls_token .bias norm gamma + # Ir自定义 + lr: + name: Cosine + learning_rate: 0.0015 + T_max: 200 + eta_min: 1e-5 + warmup_steps: 10 + warmup_start_Ir: 0 + # end Ir 不用 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: BEiT_ImageNet + image_root: /home/aistudio/data/data89857/ILSVRC2012mini/ + cls_label_path: /home/aistudio/data/data89857/ILSVRC2012mini/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True, + channel_first: False + - ColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.4 + - RandomHorizontalFlip: + - RandomResizedCropAndInterpolationWithTwoPic: + size: 224 + second_size: 224 + scale: [0.2, 1.0] + interpolation: 'bicubic' + second_interpolation: 'bicubic' + patch_transforms: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: 3 + - ToTensor: + visual_token_transforms: + - ToTensor: + masking_generator: + input_size: 14 + num_masking_patches: 75 + max_num_patches: None + min_num_patches: 16 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: /home/aistudio/data/data89857/ILSVRC2012mini/val/ + cls_label_path: /home/aistudio/data/data89857/ILSVRC2012mini/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + diff --git a/ppcls/data/__init__.py b/ppcls/data/__init__.py index df35eef640..9e169cc28d 100644 --- a/ppcls/data/__init__.py +++ b/ppcls/data/__init__.py @@ -37,6 +37,7 @@ from ppcls.data.dataloader.custom_label_dataset import CustomLabelDataset from ppcls.data.dataloader.cifar import Cifar10, Cifar100 from ppcls.data.dataloader.metabin_sampler import DomainShuffleBatchSampler, 
NaiveIdentityBatchSampler +from ppcls.data.dataloader.beitdataset import BEiT_ImageNet # sampler from ppcls.data.dataloader.DistributedRandomIdentitySampler import DistributedRandomIdentitySampler diff --git a/ppcls/data/dataloader/beitdataset.py b/ppcls/data/dataloader/beitdataset.py new file mode 100644 index 0000000000..6292620bd8 --- /dev/null +++ b/ppcls/data/dataloader/beitdataset.py @@ -0,0 +1,63 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import os +from .common_dataset import CommonDataset, create_operators +from ..preprocess.ops.masking_generator import MaskingGenerator +import numpy as np +from ppcls.data.preprocess import transform + +class BEiT_ImageNet(CommonDataset): + cls_filter = None + + def __init__(self, + image_root, + cls_label_path, + transform_ops=None, + patch_transforms=None, + visual_token_transforms=None, + masking_generator=None): + super(BEiT_ImageNet, self).__init__(image_root, cls_label_path, + transform_ops) + + self._patch_transform = create_operators(patch_transforms) + self._visual_token_transform = create_operators(visual_token_transforms) + self._masked_position_generator = MaskingGenerator(**masking_generator) + + def _load_anno(self): + assert os.path.exists( + self._cls_path), f"path {self._cls_path} does not exist." + assert os.path.exists( + self._img_root), f"path {self._img_root} does not exist." + self.images = [] + self.labels = [] + + with open(self._cls_path) as fd: + lines = fd.readlines() + for line in lines: + line = line.strip().split(" ") + self.images.append(os.path.join(self._img_root, line[0])) + self.labels.append(np.int64(line[1])) + assert os.path.exists(self.images[ + -1]), f"path {self.images[-1]} does not exist." 
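+    # __getitem__ below returns a 3-tuple that the BEiT v2 pre-training step
+    # consumes positionally (see the train.py change later in this patch):
+    #   [0] the crop for the student patch branch (patch_transforms applied),
+    #   [1] the crop for the VQ-KD visual-token branch (visual_token_transforms applied),
+    #   [2] the boolean patch mask produced by MaskingGenerator.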
+ + def __getitem__(self, idx): + with open(self.images[idx], 'rb') as f: + img = f.read() + for_patches, for_visual_tokens = transform(img, self._transform_ops) + return \ + (transform(for_patches, self._patch_transform), \ + transform(for_visual_tokens, self._visual_token_transform), \ + self._masked_position_generator()) diff --git a/ppcls/data/preprocess/__init__.py b/ppcls/data/preprocess/__init__.py index 66234a44bd..342f15073f 100644 --- a/ppcls/data/preprocess/__init__.py +++ b/ppcls/data/preprocess/__init__.py @@ -20,9 +20,10 @@ from ppcls.data.preprocess.ops.cutout import Cutout from ppcls.data.preprocess.ops.hide_and_seek import HideAndSeek -from ppcls.data.preprocess.ops.random_erasing import RandomErasing +from ppcls.data.preprocess.ops.random_erasing import RandomErasing, BeitV2RandomErasing from ppcls.data.preprocess.ops.grid import GridMask - +from ppcls.data.preprocess.ops.random_crop_and_interpolation import RandomResizedCropAndInterpolationWithTwoPic, RandomResizedCropAndInterpolation +from ppcls.data.preprocess.ops.masking_generator import MaskingGenerator from ppcls.data.preprocess.ops.operators import DecodeImage from ppcls.data.preprocess.ops.operators import ResizeImage from ppcls.data.preprocess.ops.operators import CropImage diff --git a/ppcls/data/preprocess/ops/masking_generator.py b/ppcls/data/preprocess/ops/masking_generator.py new file mode 100644 index 0000000000..7aa6201a67 --- /dev/null +++ b/ppcls/data/preprocess/ops/masking_generator.py @@ -0,0 +1,118 @@ +""" +Originally inspired by impl at https://github.com/zhunzhong07/Random-Erasing, Apache 2.0 +Copyright Zhun Zhong & Liang Zheng + +Hacked together by / Copyright 2020 Ross Wightman + +Modified by Hangbo Bao, for generating the masked position for visual image transformer +""" +# -------------------------------------------------------- +# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) +# Github source: https://github.com/microsoft/unilm/tree/master/beit +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# By Hangbo Bao +# Based on timm, DINO and DeiT code bases +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# Originally inspired by impl at https://github.com/zhunzhong07/Random-Erasing, Apache 2.0 +# Copyright Zhun Zhong & Liang Zheng +# +# Hacked together by / Copyright 2020 Ross Wightman +# +# Modified by Hangbo Bao, for generating the masked position for visual image transformer +# --------------------------------------------------------' +import random +import math +import numpy as np + + +class MaskingGenerator: + def __init__( + self, input_size, num_masking_patches, min_num_patches=4, max_num_patches=None, + min_aspect=0.3, max_aspect=None): + if not isinstance(input_size, tuple): + input_size = (input_size, ) * 2 + self.height, self.width = input_size + + self.num_patches = self.height * self.width + self.num_masking_patches = num_masking_patches + + self.min_num_patches = min_num_patches + self.max_num_patches = num_masking_patches if max_num_patches is None else max_num_patches + + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + + def __repr__(self): + repr_str = "Generator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % ( + self.height, self.width, self.min_num_patches, self.max_num_patches, + self.num_masking_patches, self.log_aspect_ratio[0], self.log_aspect_ratio[1]) + return repr_str + + def get_shape(self): + return 
self.height, self.width + + def _mask(self, mask, max_mask_patches): + delta = 0 + for attempt in range(10): + target_area = random.uniform(self.min_num_patches, max_mask_patches) + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < self.width and h < self.height: + top = random.randint(0, self.height - h) + left = random.randint(0, self.width - w) + + num_masked = mask[top: top + h, left: left + w].sum() + # Overlap + if 0 < h * w - num_masked <= max_mask_patches: + for i in range(top, top + h): + for j in range(left, left + w): + if mask[i, j] == 0: + mask[i, j] = 1 + delta += 1 + + if delta > 0: + break + return delta + + def __call__(self): + mask = np.zeros(shape=self.get_shape(), dtype=np.int32) + mask_count = 0 + while mask_count < self.num_masking_patches: + max_mask_patches = self.num_masking_patches - mask_count + max_mask_patches = min(max_mask_patches, self.max_num_patches) + + delta = self._mask(mask, max_mask_patches) + if delta == 0: + break + else: + mask_count += delta + + # maintain a fix number {self.num_masking_patches} + if mask_count > self.num_masking_patches: + delta = mask_count - self.num_masking_patches + mask_x, mask_y = mask.nonzero() + to_vis = np.random.choice(mask_x.shape[0], delta, replace=False) + mask[mask_x[to_vis], mask_y[to_vis]] = 0 + + elif mask_count < self.num_masking_patches: + delta = self.num_masking_patches - mask_count + mask_x, mask_y = (mask == 0).nonzero() + to_mask = np.random.choice(mask_x.shape[0], delta, replace=False) + mask[mask_x[to_mask], mask_y[to_mask]] = 1 + + assert mask.sum() == self.num_masking_patches, f"mask: {mask}, mask count {mask.sum()}" + + return mask + + +if __name__ == '__main__': + import pdb + generator = MaskingGenerator(input_size=14, num_masking_patches=118, min_num_patches=16,) + for i in range(10000000): + mask = generator() + if mask.sum() != 118: + pdb.set_trace() + print(mask) + print(mask.sum()) \ No newline at end of file diff --git a/ppcls/data/preprocess/ops/random_crop_and_interpolation.py b/ppcls/data/preprocess/ops/random_crop_and_interpolation.py new file mode 100644 index 0000000000..2fcbdcaba7 --- /dev/null +++ b/ppcls/data/preprocess/ops/random_crop_and_interpolation.py @@ -0,0 +1,285 @@ +import paddle +import paddle.vision.transforms.functional as F +from PIL import Image +import warnings +import math +import random +import numpy as np + + +class ToNumpy: + + def __call__(self, pil_img): + np_img = np.array(pil_img, dtype=np.uint8) + if np_img.ndim < 3: + np_img = np.expand_dims(np_img, axis=-1) + np_img = np.rollaxis(np_img, 2) # HWC to CHW + return np_img + + +class ToTensor: + + def __init__(self, dtype=paddle.float32): + self.dtype = dtype + + def __call__(self, pil_img): + np_img = np.array(pil_img, dtype=np.uint8) + if np_img.ndim < 3: + np_img = np.expand_dims(np_img, axis=-1) + np_img = np.rollaxis(np_img, 2) # HWC to CHW + return paddle.to_tensor(np_img, dtype=self.dtype) + + +_pil_interpolation_to_str = { + "nearest": 'PIL.Image.NEAREST', + "bilinear" : 'PIL.Image.BILINEAR', + "bicubic": 'PIL.Image.BICUBIC', + "lanczos": 'PIL.Image.LANCZOS', + "hamming": 'PIL.Image.HAMMING', + "box": 'PIL.Image.BOX', +} + + +def _pil_interp(method): + if method == 'bicubic': + return method + elif method == 'lanczos': + return method + elif method == 'hamming': + return method + else: + # default bilinear, do we want to allow nearest? 
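+        # Note: unlike the PIL-constant lookup in transforms_factory.py, this port keeps
+        # the interpolation as a plain string (paddle.vision's resize accepts string
+        # modes), so unrecognized methods simply fall back to 'bilinear'.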
+ return 'bilinear' + + +_RANDOM_INTERPOLATION = ('bilinear', 'bicubic') + + +class RandomResizedCropAndInterpolationWithTwoPic: + """Crop the given PIL Image to random size and aspect ratio with random interpolation. + + A crop of random size (default: of 0.08 to 1.0) of the original size and a random + aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop + is finally resized to given size. + This is popularly used to train the Inception networks. + + Args: + size: expected output size of each edge + scale: range of size of the origin size cropped + ratio: range of aspect ratio of the origin aspect ratio cropped + interpolation: Default: PIL.Image.BILINEAR + """ + + def __init__(self, size, second_size=None, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.), + interpolation='bilinear', second_interpolation='lanczos'): + if isinstance(size, tuple): + self.size = size + else: + self.size = (size, size) + if second_size is not None: + if isinstance(second_size, tuple): + self.second_size = second_size + else: + self.second_size = (second_size, second_size) + else: + self.second_size = None + if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): + warnings.warn("range should be of kind (min, max)") + + if interpolation == 'random': + self.interpolation = _RANDOM_INTERPOLATION + else: + self.interpolation = _pil_interp(interpolation) + self.second_interpolation = _pil_interp(second_interpolation) + self.scale = scale + self.ratio = ratio + + def _get_params(self, img, scale, ratio): + """Get parameters for ``crop`` for a random sized crop. + + Args: + img (PIL Image): Image to be cropped. + scale (tuple): range of size of the origin size cropped + ratio (tuple): range of aspect ratio of the origin aspect ratio cropped + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for a random + sized crop. + """ + area = img.size[0] * img.size[1] + + for attempt in range(10): + target_area = random.uniform(*scale) * area + log_ratio = (math.log(ratio[0]), math.log(ratio[1])) + aspect_ratio = math.exp(random.uniform(*log_ratio)) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if w <= img.size[0] and h <= img.size[1]: + i = random.randint(0, img.size[1] - h) + j = random.randint(0, img.size[0] - w) + return i, j, h, w + + # Fallback to central crop + in_ratio = img.size[0] / img.size[1] + if in_ratio < min(ratio): + w = img.size[0] + h = int(round(w / min(ratio))) + elif in_ratio > max(ratio): + h = img.size[1] + w = int(round(h * max(ratio))) + else: # whole image + w = img.size[0] + h = img.size[1] + i = (img.size[1] - h) // 2 + j = (img.size[0] - w) // 2 + return i, j, h, w + + def __call__(self, img): + """ + Args: + img (PIL Image): Image to be cropped and resized. + + Returns: + PIL Image: Randomly cropped and resized image. 
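+                When ``second_size`` is set, a tuple of two PIL Images is returned:
+                the same crop resized to ``size`` and to ``second_size`` (the patch
+                and visual-token branches of the BEiT v2 pre-training pipeline).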
+ """ + if isinstance(img, np.ndarray): + img = Image.fromarray(img) + i, j, h, w = self._get_params(img, self.scale, self.ratio) + if isinstance(self.interpolation, (tuple, list)): + interpolation = random.choice(self.interpolation) + else: + interpolation = self.interpolation + if self.second_size is None: + img_ = F.crop(img, i, j, h, w) + img_ = F.resize(img_, self.size, interpolation) + return img_ + else: + img1 = F.crop(img, i, j, h, w) + img1 = F.resize(img1, self.size, interpolation) + img2 = F.crop(img, i, j, h, w) + img2 = F.resize(img2, self.second_size, self.second_interpolation) + return img1, img2 + + def __repr__(self): + if isinstance(self.interpolation, (tuple, list)): + interpolate_str = ' '.join([_pil_interpolation_to_str[x] for x in self.interpolation]) + else: + interpolate_str = _pil_interpolation_to_str[self.interpolation] + format_string = self.__class__.__name__ + '(size={0}'.format(self.size) + format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale)) + format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio)) + format_string += ', interpolation={0}'.format(interpolate_str) + if self.second_size is not None: + format_string += ', second_size={0}'.format(self.second_size) + format_string += ', second_interpolation={0}'.format(_pil_interpolation_to_str[self.second_interpolation]) + format_string += ')' + return format_string + + +class RandomResizedCropAndInterpolation: + """Crop the given PIL Image to random size and aspect ratio with random interpolation. + + A crop of random size (default: of 0.08 to 1.0) of the original size and a random + aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop + is finally resized to given size. + This is popularly used to train the Inception networks. + + Args: + size: expected output size of each edge + scale: range of size of the origin size cropped + ratio: range of aspect ratio of the origin aspect ratio cropped + interpolation: Default: PIL.Image.BILINEAR + """ + + def __init__(self, size, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.), + interpolation='bilinear'): + if isinstance(size, (list, tuple)): + self.size = tuple(size) + else: + self.size = (size, size) + + if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): + warnings.warn("range should be of kind (min, max)") + + if interpolation == 'random': + self.interpolation = _RANDOM_INTERPOLATION + else: + self.interpolation = _pil_interp(interpolation) + self.scale = scale + self.ratio = ratio + + @staticmethod + def get_params(img, scale, ratio): + """Get parameters for ``crop`` for a random sized crop. + + Args: + img (PIL Image): Image to be cropped. + scale (tuple): range of size of the origin size cropped + ratio (tuple): range of aspect ratio of the origin aspect ratio cropped + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for a random + sized crop. 
+ """ + area = img.size[0] * img.size[1] + + for attempt in range(10): + target_area = random.uniform(*scale) * area + log_ratio = (math.log(ratio[0]), math.log(ratio[1])) + aspect_ratio = math.exp(random.uniform(*log_ratio)) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if w <= img.size[0] and h <= img.size[1]: + i = random.randint(0, img.size[1] - h) + j = random.randint(0, img.size[0] - w) + return i, j, h, w + + # Fallback to central crop + in_ratio = img.size[0] / img.size[1] + if in_ratio < min(ratio): + w = img.size[0] + h = int(round(w / min(ratio))) + elif in_ratio > max(ratio): + h = img.size[1] + w = int(round(h * max(ratio))) + else: # whole image + w = img.size[0] + h = img.size[1] + i = (img.size[1] - h) // 2 + j = (img.size[0] - w) // 2 + return i, j, h, w + + def __call__(self, img): + """ + Args: + img (PIL Image): Image to be cropped and resized. + + Returns: + PIL Image: Randomly cropped and resized image. + """ + if isinstance(img, np.ndarray): + img = Image.fromarray(img) + i, j, h, w = self.get_params(img, self.scale, self.ratio) + if isinstance(self.interpolation, (tuple, list)): + interpolation = random.choice(self.interpolation) + else: + interpolation = self.interpolation + + img_ = F.crop(img, i, j, h, w) + img_ = F.resize(img_, self.size, interpolation) + return img_ + + def __repr__(self): + if isinstance(self.interpolation, (tuple, list)): + interpolate_str = ' '.join([_pil_interpolation_to_str[x] for x in self.interpolation]) + else: + interpolate_str = _pil_interpolation_to_str[self.interpolation] + format_string = self.__class__.__name__ + '(size={0}'.format(self.size) + format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale)) + format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio)) + format_string += ', interpolation={0})'.format(interpolate_str) + return format_string \ No newline at end of file diff --git a/ppcls/data/preprocess/ops/random_erasing.py b/ppcls/data/preprocess/ops/random_erasing.py index e687283c7a..15e18e250d 100644 --- a/ppcls/data/preprocess/ops/random_erasing.py +++ b/ppcls/data/preprocess/ops/random_erasing.py @@ -19,11 +19,24 @@ import math import random - +import paddle import numpy as np from .operators import format_data +def _get_pixels(per_pixel, rand_color, patch_size, dtype=paddle.float32, device='cuda'): + # NOTE I've seen CUDA illegal memory access errors being caused by the normal_() + # paths, flip the order so normal is run on CPU if this becomes a problem + # Issue has been fixed in master https://github.com/pytorch/pytorch/issues/19508 + if per_pixel: + # return torch.empty(patch_size, dtype=dtype, device=device).normal_() + return paddle.normal(shape=patch_size) + elif rand_color: + # return torch.empty((patch_size[0], 1, 1), dtype=dtype, device=device).normal_() + return paddle.normal(shape=[patch_size[0], 1, 1]) + else: + # return torch.zeros((patch_size[0], 1, 1), dtype=dtype, device=device) + return paddle.zeros([patch_size[0], 1, 1], dtype=dtype) class Pixels(object): def __init__(self, mode="const", mean=[0., 0., 0.]): @@ -111,3 +124,77 @@ def __call__(self, img): return img return img + +class BeitV2RandomErasing(object): + """ Randomly selects a rectangle region in an image and erases its pixels. + 'Random Erasing Data Augmentation' by Zhong et al. 
+ See https://arxiv.org/pdf/1708.04896.pdf + + This variant of BeitV2RandomErasing is intended to be applied to either a batch + or single image tensor after it has been normalized by dataset mean and std. + Args: + probability: Probability that the Random Erasing operation will be performed. + min_area: Minimum percentage of erased area wrt input image area. + max_area: Maximum percentage of erased area wrt input image area. + min_aspect: Minimum aspect ratio of erased area. + mode: pixel color mode, one of 'const', 'rand', or 'pixel' + 'const' - erase block is constant color of 0 for all channels + 'rand' - erase block is same per-channel random (normal) color + 'pixel' - erase block is per-pixel random (normal) color + max_count: maximum number of erasing blocks per image, area per box is scaled by count. + per-image count is randomly chosen between 1 and this value. + """ + + def __init__( + self, + probability=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, + mode='const', min_count=1, max_count=None, num_splits=0, device='cuda'): + self.probability = probability + self.min_area = min_area + self.max_area = max_area + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + self.min_count = min_count + self.max_count = max_count or min_count + self.num_splits = num_splits + mode = mode.lower() + self.rand_color = False + self.per_pixel = False + if mode == 'rand': + self.rand_color = True # per block random normal + elif mode == 'pixel': + self.per_pixel = True # per pixel random normal + else: + assert not mode or mode == 'const' + self.device = device + + def _erase(self, img, chan, img_h, img_w, dtype): + if random.random() > self.probability: + return + area = img_h * img_w + count = self.min_count if self.min_count == self.max_count else \ + random.randint(self.min_count, self.max_count) + for _ in range(count): + for attempt in range(10): + target_area = random.uniform(self.min_area, self.max_area) * area / count + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < img_w and h < img_h: + top = random.randint(0, img_h - h) + left = random.randint(0, img_w - w) + img[:, top:top + h, left:left + w] = _get_pixels( + self.per_pixel, self.rand_color, (chan, h, w), + dtype=dtype, device=self.device) + break + + def __call__(self, input): + if len(input.shape) == 3: + self._erase(input, *input.shape, input.dtype) + else: + batch_size, chan, img_h, img_w = input.shape + # skip first slice of batch if num_splits is set (for clean portion of samples) + batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 + for i in range(batch_start, batch_size): + self._erase(input[i], chan, img_h, img_w, input.dtype) + return input \ No newline at end of file diff --git a/ppcls/data/preprocess/ops/transforms_factory.py b/ppcls/data/preprocess/ops/transforms_factory.py new file mode 100644 index 0000000000..cb7d58ceb0 --- /dev/null +++ b/ppcls/data/preprocess/ops/transforms_factory.py @@ -0,0 +1,236 @@ +import math +from PIL import Image + +import paddle +from paddle.vision import transforms +from .random_crop_and_interpolation import ToNumpy +from .timm_autoaugment import rand_augment_transform, augment_and_mix_transform, auto_augment_transform +from .random_crop_and_interpolation import RandomResizedCropAndInterpolation +from .random_erasing import BeitV2RandomErasing +import 
numpy as np +import random + +def transforms_noaug_train( + img_size=224, + interpolation='bilinear', + use_prefetcher=False, + mean = [0.485, 0.456, 0.406], + std = [0.229, 0.224, 0.225] +): + if interpolation == 'random': + # random interpolation not supported with no-aug + interpolation = 'bilinear' + tfl = [ + transforms.Resize(img_size, interpolation=interpolation), + transforms.CenterCrop(img_size) + ] + if use_prefetcher: + # prefetcher and collate will handle tensor conversion and norm + tfl += [ToNumpy()] + else: + tfl += [ + transforms.ToTensor(), + transforms.Normalize( + mean=mean, + std=std) + ] + return transforms.Compose(tfl) + + + +def _pil_interp(method): + if method == 'bicubic': + return Image.BICUBIC + elif method == 'lanczos': + return Image.LANCZOS + elif method == 'hamming': + return Image.HAMMING + else: + # default bilinear, do we want to allow nearest? + return Image.BILINEAR + +def transforms_imagenet_train( + img_size=224, + scale=None, + ratio=None, + hflip=0.5, + vflip=0., + color_jitter=0.4, + auto_augment=None, + interpolation='random', + use_prefetcher=False, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + re_prob=0., + re_mode='const', + re_count=1, + re_num_splits=0, + separate=False, +): + """ + If separate==True, the transforms are returned as a tuple of 3 separate transforms + for use in a mixing dataset that passes + * all data through the first (primary) transform, called the 'clean' data + * a portion of the data through the secondary transform + * normalizes and converts the branches above with the third, final transform + """ + scale = tuple(scale or (0.08, 1.0)) # default imagenet scale range + ratio = tuple(ratio or (3./4., 4./3.)) # default imagenet ratio range + primary_tfl = [ + RandomResizedCropAndInterpolation(img_size, scale=scale, ratio=ratio, interpolation=interpolation)] + if hflip > 0.: + primary_tfl += [transforms.RandomHorizontalFlip(prob=hflip)] + if vflip > 0.: + primary_tfl += [transforms.RandomVerticalFlip(prob=vflip)] + + secondary_tfl = [] + if auto_augment: + assert isinstance(auto_augment, str) + if isinstance(img_size, (tuple, list)): + img_size_min = min(img_size) + else: + img_size_min = img_size + aa_params = dict( + translate_const=int(img_size_min * 0.45), + img_mean=tuple([min(255, round(255 * x)) for x in mean]), + ) + if interpolation and interpolation != 'random': + aa_params['interpolation'] = _pil_interp(interpolation) + if auto_augment.startswith('rand'): + secondary_tfl += [rand_augment_transform(auto_augment, aa_params)] + elif auto_augment.startswith('augmix'): + aa_params['translate_pct'] = 0.3 + secondary_tfl += [augment_and_mix_transform(auto_augment, aa_params)] + else: + secondary_tfl += [auto_augment_transform(auto_augment, aa_params)] + elif color_jitter is not None: + # color jitter is enabled when not using AA + if isinstance(color_jitter, (list, tuple)): + # color jitter should be a 3-tuple/list if spec brightness/contrast/saturation + # or 4 if also augmenting hue + assert len(color_jitter) in (3, 4) + else: + # if it's a scalar, duplicate for brightness, contrast, and saturation, no hue + color_jitter = (float(color_jitter),) * 3 + secondary_tfl += [transforms.ColorJitter(*color_jitter)] + + final_tfl = [] + if use_prefetcher: + # prefetcher and collate will handle tensor conversion and norm + final_tfl += [ToNumpy()] + else: + final_tfl += [ + transforms.ToTensor(), + transforms.Normalize( + mean=mean, + std=std) + ] + if re_prob > 0.: + final_tfl.append( + 
BeitV2RandomErasing(re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits, device='cpu')) + + if separate: + return transforms.Compose(primary_tfl), transforms.Compose(secondary_tfl), transforms.Compose(final_tfl) + else: + return transforms.Compose(primary_tfl + secondary_tfl + final_tfl) + + +def transforms_imagenet_eval( + img_size=224, + crop_pct=None, + interpolation='bilinear', + use_prefetcher=False, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]): + crop_pct = crop_pct + + if isinstance(img_size, (tuple, list)): + assert len(img_size) == 2 + if img_size[-1] == img_size[-2]: + # fall-back to older behaviour so Resize scales to shortest edge if target is square + scale_size = int(math.floor(img_size[0] / crop_pct)) + else: + scale_size = tuple([int(x / crop_pct) for x in img_size]) + else: + scale_size = int(math.floor(img_size / crop_pct)) + + tfl = [ + transforms.Resize(scale_size, interpolation), + transforms.CenterCrop(img_size), + ] + if use_prefetcher: + # prefetcher and collate will handle tensor conversion and norm + tfl += [ToNumpy()] + else: + tfl += [ + transforms.ToTensor(), + transforms.Normalize( + mean=mean, + std=std) + ] + + return transforms.Compose(tfl) + + +def create_transform( + input_size, + is_training=False, + use_prefetcher=False, + no_aug=False, + scale=None, + ratio=None, + hflip=0.5, + vflip=0., + color_jitter=0.4, + auto_augment=None, + interpolation='bilinear', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + re_prob=0., + re_mode='const', + re_count=1, + re_num_splits=0, + crop_pct=None, + tf_preprocessing=False, + separate=False): + + if isinstance(input_size, (tuple, list)): + img_size = input_size[-2:] + else: + img_size = input_size + + if is_training and no_aug: + transform = transforms_noaug_train( + img_size, + interpolation=interpolation, + use_prefetcher=use_prefetcher, + mean=mean, + std=std) + elif is_training: + transform = transforms_imagenet_train( + img_size, + scale=scale, + ratio=ratio, + hflip=hflip, + vflip=vflip, + color_jitter=color_jitter, + auto_augment=auto_augment, + interpolation=interpolation, + use_prefetcher=use_prefetcher, + mean=mean, + std=std, + re_prob=re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits, + separate=separate) + else: + transform = transforms_imagenet_eval( + img_size, + interpolation=interpolation, + use_prefetcher=use_prefetcher, + mean=mean, + std=std, + crop_pct=crop_pct) + + return transform \ No newline at end of file diff --git a/ppcls/engine/engine.py b/ppcls/engine/engine.py index 7058ec495e..21d318d8d1 100755 --- a/ppcls/engine/engine.py +++ b/ppcls/engine/engine.py @@ -28,7 +28,7 @@ from ppcls.utils.logger import init_logger from ppcls.utils.config import print_config from ppcls.data import build_dataloader -from ppcls.arch import build_model, RecModel, DistillationModel, TheseusLayer +from ppcls.arch import build_model, RecModel, DistillationModel, TheseusLayer, Beitv2Model from ppcls.arch import apply_to_static from ppcls.loss import build_loss from ppcls.metric import build_metrics @@ -60,6 +60,10 @@ def __init__(self, config, mode="train"): self.is_rec = True else: self.is_rec = False + if self.config["Arch"].get("is_beitv2", False): + self.is_beitv2 = True + else: + self.is_beitv2 = False # set seed seed = self.config["Global"].get("seed", False) diff --git a/ppcls/engine/train/train.py b/ppcls/engine/train/train.py index f5ec7a88df..b93cc01d82 100644 --- a/ppcls/engine/train/train.py +++ b/ppcls/engine/train/train.py @@ 
-107,7 +107,9 @@ def train_epoch(engine, epoch_id, print_batch_step): def forward(engine, batch): - if not engine.is_rec: + if not engine.is_rec and not engine.is_beitv2: return engine.model(batch[0]) - else: + elif engine.is_rec: return engine.model(batch[0], batch[1]) + else: + return engine.model(batch[0], batch[1], batch[2]) \ No newline at end of file diff --git a/ppcls/loss/__init__.py b/ppcls/loss/__init__.py index adf770dfd2..aaf3ec084b 100644 --- a/ppcls/loss/__init__.py +++ b/ppcls/loss/__init__.py @@ -34,7 +34,7 @@ from .distillationloss import DistillationMultiLabelLoss from .distillationloss import DistillationDISTLoss from .distillationloss import DistillationPairLoss - +from .distillationloss import DistillationBeitV2CELoss from .multilabelloss import MultiLabelLoss from .afdloss import AFDLoss diff --git a/ppcls/loss/distillationloss.py b/ppcls/loss/distillationloss.py index 6ccbbb8406..49435dba8e 100644 --- a/ppcls/loss/distillationloss.py +++ b/ppcls/loss/distillationloss.py @@ -424,3 +424,33 @@ def forward(self, predicts, batch): else: loss_dict[f"{self.name}_{idx}_{pair[0]}_{pair[1]}"] = loss return loss_dict + +class DistillationBeitV2CELoss(CELoss): + """ + DistillationBeitV2CELoss + """ + + def __init__(self, + model_name_pairs=[], + epsilon=None, + name="loss_beitv2"): + super().__init__(epsilon=epsilon) + assert isinstance(model_name_pairs, list) + self.model_name_pairs = model_name_pairs + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + loss = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if isinstance(out2, list): + loss_1 = super().forward(out2[0], out1) + loss_2 = super().forward(out2[1], out1) + loss["CELoss"] = loss_1["CELoss"] + loss_2["CELoss"] + else: + loss = super().forward(out2, out1) + for key in loss: + loss_dict["{}_{}_{}".format(self.name, pair[0], pair[1])] = loss[key] + return loss_dict \ No newline at end of file diff --git a/ppcls/optimizer/optimizer.py b/ppcls/optimizer/optimizer.py index f3c3d354b8..70d81efd67 100644 --- a/ppcls/optimizer/optimizer.py +++ b/ppcls/optimizer/optimizer.py @@ -247,6 +247,7 @@ def __init__(self, grad_clip=None, no_weight_decay_name=None, one_dim_param_no_weight_decay=False, + layer_decay=1.0, **args): super().__init__() self.learning_rate = learning_rate @@ -259,11 +260,47 @@ def __init__(self, self.no_weight_decay_name_list = no_weight_decay_name.split( ) if no_weight_decay_name else [] self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay + self.layer_decay = layer_decay def __call__(self, model_list): # model_list is None in static graph - parameters = sum([m.parameters() for m in model_list], - []) if model_list else None + def get_num_layer_for_vit(var_name, num_max_layer): + if var_name in ("cls_token", "mask_token", "pos_embed"): + return 0 + elif var_name.startswith("patch_embed"): + return 0 + elif var_name.startswith("rel_pos_bias"): + return num_max_layer - 1 + elif var_name.startswith("blocks"): + layer_id = int(var_name.split('.')[1]) + return layer_id + 1 + else: + return num_max_layer - 1 + + class LayerDecayValueAssigner(object): + def __init__(self, values): + self.values = values + + def get_scale(self, layer_id): + return self.values[layer_id] + + def get_layer_id(self, var_name): + return get_num_layer_for_vit(var_name, len(self.values)) + + if self.layer_decay < 1.0: + parameters_list = [] + for m in model_list: + num_layers = m.get_num_layers() + assigner = 
LayerDecayValueAssigner(list(self.layer_decay ** (num_layers + 1 - i) for i in range(num_layers + 2))) + skip_weight_decay_list = m.no_weight_decay() + parameters_list.append(self._get_parameter_groups(m, self.weight_decay, skip_weight_decay_list, + get_num_layer=assigner.get_layer_id, + get_layer_scale=assigner.get_scale)) + parameters = sum(parameters_list, []) + self.weight_decay = 0. + else: + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None # TODO(gaotingquan): model_list is None when in static graph, "no_weight_decay" not work. if model_list is None: @@ -300,6 +337,48 @@ def __call__(self, model_list): def _apply_decay_param_fun(self, name): return name not in self.no_weight_decay_param_name_list + def _get_parameter_groups(self, model, weight_decay=0.05, skip_list=(), get_num_layer=None, get_layer_scale=None): + parameter_group_names = {} + parameter_group_vars = {} + for name, param in model.named_parameters(): + print(param.stop_gradient) + if param.stop_gradient: + continue + + if param.ndim <= 1 or name.endswith(".bias") or name in skip_list: # param.ndim <= 1 len(param.shape) == 1 + group_name = "no_decay" + this_weight_decay = 0. + else: + group_name = "decay" + this_weight_decay = weight_decay + + if get_num_layer is not None: + layer_id = get_num_layer(name) + group_name = "layer_%d_%s" % (layer_id, group_name) + else: + layer_id = None + + if group_name not in parameter_group_names: + if get_layer_scale is not None: + scale = get_layer_scale(layer_id) + else: + scale = 1. + + parameter_group_names[group_name] = { + "weight_decay": this_weight_decay, + "params": [], + "learning_rate": scale + } + parameter_group_vars[group_name] = { + "weight_decay": this_weight_decay, + "params": [], + "learning_rate": scale + } + + parameter_group_vars[group_name]["params"].append(param) + parameter_group_names[group_name]["params"].append(name) + return list(parameter_group_vars.values()) + class AdamWDL(object): """ From 412ec1df262c660bb52d37a0c5aa0d3f4f54c424 Mon Sep 17 00:00:00 2001 From: liuxuewen <18435135529@163.com> Date: Thu, 18 May 2023 19:47:33 +0800 Subject: [PATCH 2/7] feat(BeitV2): remove print and modify yaml file --- ppcls/arch/backbone/model_zoo/vqkd.py | 1 - .../BeitV2/BeitV2_base_patch16_224_ft.yaml | 11 ++++- .../BeitV2/BeitV2_base_patch16_224_pt.yaml | 48 ++++--------------- ppcls/engine/train/train.py | 3 +- ppcls/optimizer/optimizer.py | 1 - requirements.txt | 1 + 6 files changed, 22 insertions(+), 43 deletions(-) diff --git a/ppcls/arch/backbone/model_zoo/vqkd.py b/ppcls/arch/backbone/model_zoo/vqkd.py index 08140d4bb4..5a5598f4fc 100644 --- a/ppcls/arch/backbone/model_zoo/vqkd.py +++ b/ppcls/arch/backbone/model_zoo/vqkd.py @@ -49,7 +49,6 @@ def __init__(self, **kwargs ): super().__init__() - print(kwargs) if decoder_config['in_chans'] != embed_dim: print(f"Rewrite the in_chans in decoder from {decoder_config['in_chans']} to {embed_dim}") decoder_config['in_chans'] = embed_dim diff --git a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_ft.yaml b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_ft.yaml index 2b72bdf7d5..b5893cac77 100644 --- a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_ft.yaml +++ b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_ft.yaml @@ -4,7 +4,7 @@ Global: pretrained_model: null output_dir: ./output/ device: gpu - save_interval: 50 + save_interval: 1 eval_during_train: True eval_interval: 1 epochs: 100 @@ -15,7 +15,14 @@ Global: save_inference_dir: ./inference to_static: False 
seed: 0 - distributed: 4 + distributed: 8 + +AMP: + scale_loss: 65536.0 + use_dynamic_loss_scaling: True + incr_every_n_steps: 2000 + # O1: mixed fp16 + level: O1 # model architecture Arch: diff --git a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml index eacaeea976..675ca79ee7 100644 --- a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml +++ b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml @@ -2,7 +2,7 @@ Global: checkpoints: null pretrained_model: null - output_dir: ./output/beitv2_clip_dist + output_dir: ./output/ device: gpu save_interval: 1 eval_during_train: False @@ -16,7 +16,7 @@ Global: save_inference_dir: ./inference to_static: False seed: 0 - distributed: 4 + distributed: 8 AMP: scale_loss: 65536.0 @@ -29,7 +29,7 @@ AMP: Arch: name: "Beitv2Model" drop_path_rate : 0.1 - class_num: &class_num 1000 + class_num: 1000 is_beitv2: True # if not null, its lengths should be same as models pretrained_list: @@ -42,7 +42,7 @@ Arch: - Teacher: name: vqkd_encoder_base_decoder_3x768x12_clip pretrained: True - pretrained_weight: /home/aistudio/weight/vqkd.pdparams + pretrained_weight: ./dataset/vqkd.pdparams as_tokenzer: False img_size: 224 n_code: 8192 @@ -50,7 +50,7 @@ Arch: - Student: name: beit_base_patch16_224_8k_vocab_cls_pt pretrained: True - pretrained_weight: /home/aistudio/weight/pretrain_model.pdparams + pretrained_weight: ./dataset/pretrain_model.pdparams drop_path_rate: 0.1 use_shared_rel_pos_bias: True use_abs_pos_emb: False @@ -75,14 +75,14 @@ Optimizer: beta1: 0.9 beta2: 0.98 momentum: 0.9 - weight_decay: 1e-4 - epsilon: 1e-8 + weight_decay: 0.05 #multi precision: True no_weight_decay_name: pos_embed cls_token .bias norm gamma + one_dim_param_no_weight_decay: True # Ir自定义 lr: name: Cosine - learning_rate: 0.0015 + learning_rate: 1.5e-3 T_max: 200 eta_min: 1e-5 warmup_steps: 10 @@ -94,8 +94,8 @@ DataLoader: Train: dataset: name: BEiT_ImageNet - image_root: /home/aistudio/data/data89857/ILSVRC2012mini/ - cls_label_path: /home/aistudio/data/data89857/ILSVRC2012mini/train_list.txt + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt transform_ops: - DecodeImage: to_rgb: True, @@ -135,31 +135,3 @@ DataLoader: loader: num_workers: 4 use_shared_memory: True - - Eval: - dataset: - name: ImageNetDataset - image_root: /home/aistudio/data/data89857/ILSVRC2012mini/val/ - cls_label_path: /home/aistudio/data/data89857/ILSVRC2012mini/val_list.txt - transform_ops: - - DecodeImage: - to_rgb: True - channel_first: False - - ResizeImage: - resize_short: 256 - - CropImage: - size: 224 - - NormalizeImage: - scale: 1.0/255.0 - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - order: '' - sampler: - name: DistributedBatchSampler - batch_size: 256 - drop_last: False - shuffle: False - loader: - num_workers: 8 - use_shared_memory: True - diff --git a/ppcls/engine/train/train.py b/ppcls/engine/train/train.py index b93cc01d82..5d638d79c2 100644 --- a/ppcls/engine/train/train.py +++ b/ppcls/engine/train/train.py @@ -70,7 +70,8 @@ def train_epoch(engine, epoch_id, print_batch_step): scaled.backward() if (iter_id + 1) % engine.update_freq == 0: for i in range(len(engine.optimizer)): - engine.scaler.minimize(engine.optimizer[i], scaled) + engine.scaler.step(engine.optimizer[i]) + engine.scaler.update() else: loss.backward() if (iter_id + 1) % engine.update_freq == 0: diff --git a/ppcls/optimizer/optimizer.py b/ppcls/optimizer/optimizer.py index 
70d81efd67..31b8c01892 100644 --- a/ppcls/optimizer/optimizer.py +++ b/ppcls/optimizer/optimizer.py @@ -341,7 +341,6 @@ def _get_parameter_groups(self, model, weight_decay=0.05, skip_list=(), get_num_ parameter_group_names = {} parameter_group_vars = {} for name, param in model.named_parameters(): - print(param.stop_gradient) if param.stop_gradient: continue diff --git a/requirements.txt b/requirements.txt index b46ed61faf..1179411ed7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ scikit-learn>=0.21.0 gast==0.3.3 faiss-cpu easydict +einops From 4f523745a505d5242171fd7910711d2925a7983d Mon Sep 17 00:00:00 2001 From: liuxuewen <18435135529@163.com> Date: Mon, 22 May 2023 20:16:37 +0800 Subject: [PATCH 3/7] fix(yaml):modify ft from 4 to 8 and fix pt bug --- ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_ft.yaml | 4 ++-- ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_ft.yaml b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_ft.yaml index b5893cac77..c6d5fd6413 100644 --- a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_ft.yaml +++ b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_ft.yaml @@ -64,8 +64,8 @@ Optimizer: lr: # for 8 cards name: Cosine - learning_rate: 25e-5 - eta_min: 5e-7 + learning_rate: 5e-4 + eta_min: 1e-6 warmup_epoch: 20 warmup_start_lr: 0 diff --git a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml index 675ca79ee7..e3676b0309 100644 --- a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml +++ b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml @@ -16,7 +16,7 @@ Global: save_inference_dir: ./inference to_static: False seed: 0 - distributed: 8 + distributed: 16 AMP: scale_loss: 65536.0 @@ -83,7 +83,7 @@ Optimizer: lr: name: Cosine learning_rate: 1.5e-3 - T_max: 200 + T_max: 300 eta_min: 1e-5 warmup_steps: 10 warmup_start_Ir: 0 From a7f99b710592b0b01afa088218b00ec227af70b0 Mon Sep 17 00:00:00 2001 From: liuxuewen <18435135529@163.com> Date: Wed, 24 May 2023 19:56:53 +0800 Subject: [PATCH 4/7] feat(beitv2): modify how weights are loaded when finetune --- ppcls/arch/backbone/model_zoo/modeling_finetune.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ppcls/arch/backbone/model_zoo/modeling_finetune.py b/ppcls/arch/backbone/model_zoo/modeling_finetune.py index a6b12cf761..6f2ffbdbbe 100644 --- a/ppcls/arch/backbone/model_zoo/modeling_finetune.py +++ b/ppcls/arch/backbone/model_zoo/modeling_finetune.py @@ -582,6 +582,11 @@ def beit_base_patch16_224(pretrained=False, finetune_weight=None, model_filter_n all_keys = list(checkpoint_model.keys()) for key in all_keys: + if "Teacher" in key: + checkpoint_model.pop(key) + elif "Student" in key: + checkpoint_model[key.strip("Student.")] = checkpoint_model.pop(key) + if "relative_position_index" in key: checkpoint_model.pop(key) From 127f54c61908ed28447530b09fb5966a5bb1b813 Mon Sep 17 00:00:00 2001 From: liuxuewen <18435135529@163.com> Date: Thu, 25 May 2023 18:46:28 +0800 Subject: [PATCH 5/7] feat(BeitV2): modify the way of weight init --- ppcls/arch/backbone/model_zoo/modeling_pretrain.py | 5 +++-- .../configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ppcls/arch/backbone/model_zoo/modeling_pretrain.py b/ppcls/arch/backbone/model_zoo/modeling_pretrain.py index 
b4153bcba4..8e795eec8b 100644 --- a/ppcls/arch/backbone/model_zoo/modeling_pretrain.py +++ b/ppcls/arch/backbone/model_zoo/modeling_pretrain.py @@ -358,7 +358,8 @@ def beit_base_patch16_224_8k_vocab_cls_pt(pretrained=False, pretrained_weight=No model = VisionTransformerForMaskedImageModelingCLS( patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, norm_layer="nn.LayerNorm", vocab_size=vocab_size, **kwargs) - weight = paddle.load(pretrained_weight) - model.set_dict(weight) + if pretrained: + weight = paddle.load(pretrained_weight) + model.set_dict(weight) model.default_cfg = _cfg() return model \ No newline at end of file diff --git a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml index e3676b0309..01c48c7073 100644 --- a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml +++ b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml @@ -49,8 +49,7 @@ Arch: code_dim: 32 - Student: name: beit_base_patch16_224_8k_vocab_cls_pt - pretrained: True - pretrained_weight: ./dataset/pretrain_model.pdparams + pretrained: False drop_path_rate: 0.1 use_shared_rel_pos_bias: True use_abs_pos_emb: False From ca1578a124ea64a46e4e909e2f2a1a2fae78033d Mon Sep 17 00:00:00 2001 From: liuxuewen <18435135529@163.com> Date: Mon, 29 May 2023 14:34:17 +0800 Subject: [PATCH 6/7] feat(BeitV2): modify some format for review --- ppcls/arch/backbone/__init__.py | 3 +- .../{modeling_finetune.py => BeiTV2.py} | 346 ++++++++++++++++- .../backbone/model_zoo/modeling_pretrain.py | 365 ------------------ .../backbone/model_zoo/norm_ema_quantizer.py | 242 ------------ ppcls/arch/backbone/model_zoo/vqkd.py | 243 +++++++++++- .../BeitV2/BeitV2_base_patch16_224_pt.yaml | 2 +- ppcls/engine/engine.py | 4 - ppcls/engine/train/__init__.py | 1 + ppcls/engine/train/train.py | 6 +- ppcls/engine/train/train_mask_samples.py | 110 ++++++ 10 files changed, 699 insertions(+), 623 deletions(-) rename ppcls/arch/backbone/model_zoo/{modeling_finetune.py => BeiTV2.py} (64%) delete mode 100644 ppcls/arch/backbone/model_zoo/modeling_pretrain.py delete mode 100644 ppcls/arch/backbone/model_zoo/norm_ema_quantizer.py create mode 100644 ppcls/engine/train/train_mask_samples.py diff --git a/ppcls/arch/backbone/__init__.py b/ppcls/arch/backbone/__init__.py index 9dc12bd3dd..1f964c482a 100644 --- a/ppcls/arch/backbone/__init__.py +++ b/ppcls/arch/backbone/__init__.py @@ -76,8 +76,7 @@ from .model_zoo.nextvit import NextViT_small_224, NextViT_base_224, NextViT_large_224, NextViT_small_384, NextViT_base_384, NextViT_large_384 from .model_zoo.cae import cae_base_patch16_224, cae_large_patch16_224 from .model_zoo.vqkd import vqkd_encoder_base_decoder_3x768x12_clip -from .model_zoo.modeling_pretrain import beit_base_patch16_224_8k_vocab_cls_pt -from .model_zoo.modeling_finetune import beit_base_patch16_224 +from .model_zoo.BeiTV2 import beit_base_patch16_224, beit_base_patch16_224_8k_vocab_cls_pt from .variant_models.resnet_variant import ResNet50_last_stage_stride1 from .variant_models.resnet_variant import ResNet50_adaptive_max_pool2d diff --git a/ppcls/arch/backbone/model_zoo/modeling_finetune.py b/ppcls/arch/backbone/model_zoo/BeiTV2.py similarity index 64% rename from ppcls/arch/backbone/model_zoo/modeling_finetune.py rename to ppcls/arch/backbone/model_zoo/BeiTV2.py index 6f2ffbdbbe..8a431dec65 100644 --- a/ppcls/arch/backbone/model_zoo/modeling_finetune.py +++ b/ppcls/arch/backbone/model_zoo/BeiTV2.py @@ -10,6 +10,7 @@ from 
paddle.nn.initializer import TruncatedNormal, Constant, Normal from collections import OrderedDict + def _cfg(url='', **kwargs): return { 'url': url, @@ -35,7 +36,7 @@ def drop_path(x, drop_prob=0., training=False): """ if drop_prob == 0. or not training: return x - keep_prob = paddle.to_tensor(1 - drop_prob) + keep_prob = paddle.to_tensor(1 - drop_prob).astype(x.dtype) shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) random_tensor = paddle.floor(random_tensor) # binarize @@ -545,8 +546,6 @@ def get_intermediate_layers(self, x, use_last_norm=False): return features - - def beit_base_patch16_224(pretrained=False, finetune_weight=None, model_filter_name='', **kwargs): model = VisionTransformer( patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, # qkv_bias=True, @@ -676,4 +675,345 @@ def geometric_progression(a, r, n): checkpoint_model['pos_embed'] = new_pos_embed model.set_dict(checkpoint_model) model.default_cfg = _cfg() + return model + +''' pretrain ''' + +class VisionTransformerForMaskedImageModeling(nn.Layer): + def __init__(self, img_size=224, patch_size=16, in_chans=3, vocab_size=8192, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=None, init_values=None, attn_head_dim=None, + use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, init_std=0.02): + super().__init__() + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + self.num_heads = num_heads + + self.cls_token = self.create_parameter(shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + self.mask_token = self.create_parameter(shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("mask_token", self.mask_token) + if use_abs_pos_emb: + self.pos_embed = self.create_parameter(shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_) + self.add_parameter("pos_embed", self.pos_embed) + else: + self.pos_embed = None + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads) + else: + self.rel_pos_bias = None + + dpr = np.linspace(0, drop_path_rate, depth, dtype=np.float32) + self.use_rel_pos_bias = use_rel_pos_bias + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, + attn_head_dim=attn_head_dim) + for i in range(depth) + ]) + self.norm = eval(norm_layer)(embed_dim, epsilon=1e-6) + + self.init_std = init_std + self.lm_head = nn.Linear(embed_dim, vocab_size) + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed) + trunc_normal_(self.cls_token) + trunc_normal_(self.mask_token) + trunc_normal_(self.lm_head.weight) + self.apply(self._init_weights) + self.fix_init_weight() + + def fix_init_weight(self): + def rescale(param, layer_id): + x = param.divide(paddle.to_tensor([math.sqrt(2.0 * layer_id)])) + param = 
paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight, layer_id + 1) + rescale(layer.mlp.fc2.weight, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def get_num_layers(self): + return len(self.blocks) + + def forward_features(self, x, bool_masked_pos): + x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks + mask_token = self.mask_token.expand((batch_size, seq_len, -1)) + + # replace the masked visual tokens by mask_token + w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) + x = x * (1 - w) + mask_token * w + + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for i, blk in enumerate(self.blocks): + x = blk(x, rel_pos_bias=rel_pos_bias) + + x = self.norm(x) + return x + + + def forward(self, x, bool_masked_pos=None, return_all_tokens=False, return_patch_tokens=False): + if bool_masked_pos is None: + bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) + x = self.forward_features(x, bool_masked_pos=bool_masked_pos) + x = x[:, 1:] + if return_patch_tokens: + return x + if return_all_tokens: + return self.lm_head(x) + else: + # return the masked tokens + return self.lm_head(x[bool_masked_pos]) + + def forward_return_qkv(self, x, bool_masked_pos=None, split_out_as_qkv=False): + if bool_masked_pos is None: + bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) + x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks + mask_token = self.mask_token.expand((batch_size, seq_len, -1)) + + # replace the masked visual tokens by mask_token + w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) + x = x * (1 - w) + mask_token * w + + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for i, blk in enumerate(self.blocks): + if i < len(self.blocks) - 1: + x = blk(x, rel_pos_bias=rel_pos_bias) + else: + x, qkv = blk(x, rel_pos_bias=rel_pos_bias, return_qkv=True) + + if split_out_as_qkv: + x = self.norm(x) + x = self.lm_head(x) + q, k, v = x.chunks(3, axis=-1) + b, n, c = q.shape + q = q.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) + k = k.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) + v = v.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) + return x, q, k, v + else: + x = self.norm(x) + x = x[:, 1:] + x = self.lm_head(x[bool_masked_pos]) + + q, k, v = qkv[0], qkv[1], qkv[2] + + return x, q, k, v + + def 
forward_intermediate(self, x, bool_masked_pos=None, layer_id=12): + if bool_masked_pos is None: + bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) + x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks + mask_token = self.mask_token.expand((batch_size, seq_len, -1)) + + # replace the masked visual tokens by mask_token + w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) + x = x * (1 - w) + mask_token * w + + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + if isinstance(layer_id, list): + output_list = [] + for l, blk in enumerate(self.blocks): + x = blk(x, rel_pos_bias=rel_pos_bias) + if l in layer_id: + output_list.append(x[:, 1:]) + return output_list + elif isinstance(layer_id, int): + for l, blk in enumerate(self.blocks): + if l < layer_id: + x = blk(x, rel_pos_bias=rel_pos_bias) + elif l == layer_id: + x = blk.norm1(x) + else: + break + return x[:, 1:] + else: + raise NotImplementedError(f"Not support for layer id is {layer_id} now!") + + def get_last_selfattention(self, x): + x = self.patch_embed(x) + batch_size, seq_len, _ = x.shape + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.patch_embed + x = self.pos_drop(x) + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + + for i, blk in enumerate(self.blocks): + if i < len(self.blocks) - 1: + x = blk(x, rel_pos_bias=rel_pos_bias) + else: + # return attention of the last block + return blk(x, rel_pos_bias=rel_pos_bias, return_attention=True) + + +class VisionTransformerForMaskedImageModelingCLS(VisionTransformerForMaskedImageModeling): + def __init__(self, img_size=224, patch_size=16, in_chans=3, vocab_size=8192, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., norm_layer=None, init_values=None, attn_head_dim=None, + use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, init_std=0.02, + early_layers=6, head_layers=2, shared_lm_head=True): + super().__init__(img_size=img_size, patch_size=patch_size, in_chans=in_chans, vocab_size=vocab_size, embed_dim=embed_dim, depth=depth, + num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rate, norm_layer=norm_layer, init_values=init_values, attn_head_dim=attn_head_dim, + use_abs_pos_emb=use_abs_pos_emb, use_rel_pos_bias=use_rel_pos_bias, use_shared_rel_pos_bias=use_shared_rel_pos_bias, init_std=init_std) + + self.early_layers = early_layers + print(f'early layer {early_layers}, late layer {depth - early_layers}, condenser head layers {head_layers}, shared_lm_head {shared_lm_head}') + + dpr = np.linspace(0, drop_path_rate, max(depth, early_layers + head_layers), dtype=np.float32) + self.cls_pt_layers = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + 
window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, + attn_head_dim=attn_head_dim) + for i in range(early_layers, early_layers + head_layers) + ]) + self.fix_init_cls_pt_weight() + + self.shared_lm_head = shared_lm_head + if not self.shared_lm_head: + self.cls_pt_norm = norm_layer(embed_dim) + self.cls_pt_lm_head = nn.Linear(embed_dim, vocab_size) + + self.cls_pt_norm.apply(self._init_weights) + self.cls_pt_lm_head.apply(self._init_weights) + + def fix_init_cls_pt_weight(self): + def rescale(param, layer_id): + x = param.divide(paddle.to_tensor([math.sqrt(2.0 * layer_id)])) + param = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight, self.early_layers + layer_id + 1) + rescale(layer.mlp.fc2.weight, self.early_layers + layer_id + 1) + + def forward_features(self, x, bool_masked_pos): + x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks + mask_token = self.mask_token.expand((batch_size, seq_len, -1)) + + # replace the masked visual tokens by mask_token + w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) + x = x * (1 - w) + mask_token * w + + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for i, blk in enumerate(self.blocks): + x = blk(x, rel_pos_bias=rel_pos_bias) + if i + 1 == self.early_layers: + early_states = x[:, 1:] + + x_cls_pt = paddle.concat((x[:, 0].unsqueeze(1), early_states), axis=1) + for blk in self.cls_pt_layers: + x_cls_pt = blk(x_cls_pt, rel_pos_bias=rel_pos_bias) + + return self.norm(x), self.norm(x_cls_pt) if self.shared_lm_head else self.cls_pt_norm(x_cls_pt) + + def forward(self, x, bool_masked_pos=None, return_all_tokens=False, return_patch_tokens=False): + if bool_masked_pos is None: + bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) + x, x_cls_pt = self.forward_features(x, bool_masked_pos=bool_masked_pos) + x = x[:, 1:] + x_cls_pt = x_cls_pt[:, 1:] + if return_patch_tokens: + return [x, x_cls_pt] + if return_all_tokens: + return [self.lm_head(x), self.lm_head(x_cls_pt) if self.shared_lm_head else self.cls_pt_lm_head(x_cls_pt)] + else: + # return the masked tokens + return [self.lm_head(x[bool_masked_pos]), self.lm_head(x_cls_pt[bool_masked_pos]) if self.shared_lm_head else self.cls_pt_lm_head(x_cls_pt[bool_masked_pos])] + + + +def beit_base_patch16_224_8k_vocab_cls_pt(pretrained=False, pretrained_weight=None, **kwargs): + if "num_classes" in kwargs: + _ = kwargs.pop("num_classes") + if 'vocab_size' in kwargs: + vocab_size = kwargs['vocab_size'] + _ = kwargs.pop("vocab_size") + else: + vocab_size = 8192 + model = VisionTransformerForMaskedImageModelingCLS( + patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, + norm_layer="nn.LayerNorm", vocab_size=vocab_size, **kwargs) + if pretrained: + weight = paddle.load(pretrained_weight) + model.set_dict(weight) + model.default_cfg = _cfg() return model \ No newline at end of file diff --git a/ppcls/arch/backbone/model_zoo/modeling_pretrain.py b/ppcls/arch/backbone/model_zoo/modeling_pretrain.py deleted file mode 100644 index 
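A minimal usage sketch of the pretraining entry point defined above: inside forward_features the masked patch positions are replaced by the learned mask token, and the model returns two sets of visual-token logits restricted to those positions, one from the final layer and one from the shallow CLS pretraining head. The keyword arguments roughly follow the pretraining yaml, init_values is a hypothetical choice, and the mask layout is arbitrary:

    import paddle
    from ppcls.arch.backbone.model_zoo.BeiTV2 import beit_base_patch16_224_8k_vocab_cls_pt

    model = beit_base_patch16_224_8k_vocab_cls_pt(
        pretrained=False,             # random init, no .pdparams file is read
        drop_path_rate=0.1,
        use_shared_rel_pos_bias=True,
        use_abs_pos_emb=False,
        init_values=0.1)              # hypothetical layer-scale init value

    images = paddle.randn([2, 3, 224, 224])
    # 14 x 14 patch grid -> 196 positions; mask the first 75 patches of each sample.
    bool_masked_pos = paddle.concat(
        [paddle.ones([2, 75]), paddle.zeros([2, 121])], axis=1).astype('bool')

    logits, logits_cls_pt = model(images, bool_masked_pos=bool_masked_pos)
    # Both tensors have shape [masked_patches_in_batch, 8192], the visual-token vocabulary.
    print(logits.shape, logits_cls_pt.shape)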
8e795eec8b..0000000000 --- a/ppcls/arch/backbone/model_zoo/modeling_pretrain.py +++ /dev/null @@ -1,365 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Code was heavily based on https://github.com/facebookresearch/deit -# reference: https://arxiv.org/abs/2012.12877 - -import math -import numpy as np -import paddle -import paddle.nn as nn - -from paddle.nn.initializer import TruncatedNormal, Constant, Normal -from .modeling_finetune import Block, PatchEmbed, RelativePositionBias, _cfg, zeros_, ones_, Identity -trunc_normal_ = TruncatedNormal(std=.02) - - -class VisionTransformerForMaskedImageModeling(nn.Layer): - def __init__(self, img_size=224, patch_size=16, in_chans=3, vocab_size=8192, embed_dim=768, depth=12, - num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., - drop_path_rate=0., norm_layer=None, init_values=None, attn_head_dim=None, - use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, init_std=0.02): - super().__init__() - self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models - - self.patch_embed = PatchEmbed( - img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) - num_patches = self.patch_embed.num_patches - self.num_heads = num_heads - - self.cls_token = self.create_parameter(shape=(1, 1, embed_dim), default_initializer=zeros_) - self.add_parameter("cls_token", self.cls_token) - self.mask_token = self.create_parameter(shape=(1, 1, embed_dim), default_initializer=zeros_) - self.add_parameter("mask_token", self.mask_token) - if use_abs_pos_emb: - self.pos_embed = self.create_parameter(shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_) - self.add_parameter("pos_embed", self.pos_embed) - else: - self.pos_embed = None - self.pos_drop = nn.Dropout(p=drop_rate) - - if use_shared_rel_pos_bias: - self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads) - else: - self.rel_pos_bias = None - - dpr = np.linspace(0, drop_path_rate, depth, dtype=np.float32) - self.use_rel_pos_bias = use_rel_pos_bias - self.blocks = nn.LayerList([ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - init_values=init_values, - window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, - attn_head_dim=attn_head_dim) - for i in range(depth) - ]) - self.norm = eval(norm_layer)(embed_dim, epsilon=1e-6) - - self.init_std = init_std - self.lm_head = nn.Linear(embed_dim, vocab_size) - - if self.pos_embed is not None: - trunc_normal_(self.pos_embed) - trunc_normal_(self.cls_token) - trunc_normal_(self.mask_token) - trunc_normal_(self.lm_head.weight) - self.apply(self._init_weights) - self.fix_init_weight() - - def fix_init_weight(self): - def rescale(param, layer_id): - x = 
param.divide(paddle.to_tensor([math.sqrt(2.0 * layer_id)])) - param = paddle.create_parameter(shape=x.shape, - dtype=str(x.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(x)) - - for layer_id, layer in enumerate(self.blocks): - rescale(layer.attn.proj.weight, layer_id + 1) - rescale(layer.mlp.fc2.weight, layer_id + 1) - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight) - if isinstance(m, nn.Linear) and m.bias is not None: - zeros_(m.bias) - elif isinstance(m, nn.LayerNorm): - zeros_(m.bias) - ones_(m.weight) - elif isinstance(m, nn.Conv2D): - trunc_normal_(m.weight) - if m.bias is not None: - zeros_(m.bias) - - def no_weight_decay(self): - return {'pos_embed', 'cls_token'} - - def get_num_layers(self): - return len(self.blocks) - - def forward_features(self, x, bool_masked_pos): - x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) - batch_size, seq_len, _ = x.shape - - cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks - mask_token = self.mask_token.expand((batch_size, seq_len, -1)) - - # replace the masked visual tokens by mask_token - w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) - x = x * (1 - w) + mask_token * w - - x = paddle.concat((cls_tokens, x), axis=1) - if self.pos_embed is not None: - x = x + self.pos_embed - x = self.pos_drop(x) - - rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None - for i, blk in enumerate(self.blocks): - x = blk(x, rel_pos_bias=rel_pos_bias) - - x = self.norm(x) - return x - - - def forward(self, x, bool_masked_pos=None, return_all_tokens=False, return_patch_tokens=False): - if bool_masked_pos is None: - bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) - x = self.forward_features(x, bool_masked_pos=bool_masked_pos) - x = x[:, 1:] - if return_patch_tokens: - return x - if return_all_tokens: - return self.lm_head(x) - else: - # return the masked tokens - return self.lm_head(x[bool_masked_pos]) - - def forward_return_qkv(self, x, bool_masked_pos=None, split_out_as_qkv=False): - if bool_masked_pos is None: - bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) - x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) - batch_size, seq_len, _ = x.shape - - cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks - mask_token = self.mask_token.expand((batch_size, seq_len, -1)) - - # replace the masked visual tokens by mask_token - w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) - x = x * (1 - w) + mask_token * w - - x = paddle.concat((cls_tokens, x), axis=1) - if self.pos_embed is not None: - x = x + self.pos_embed - x = self.pos_drop(x) - - rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None - for i, blk in enumerate(self.blocks): - if i < len(self.blocks) - 1: - x = blk(x, rel_pos_bias=rel_pos_bias) - else: - x, qkv = blk(x, rel_pos_bias=rel_pos_bias, return_qkv=True) - - if split_out_as_qkv: - x = self.norm(x) - x = self.lm_head(x) - q, k, v = x.chunks(3, axis=-1) - b, n, c = q.shape - q = q.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) - k = k.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) - v = v.reshape(b, n, self.num_heads, -1).transpose([0, 2, 1, 3]) - return x, q, k, v - else: - x = self.norm(x) - x = x[:, 1:] - x = self.lm_head(x[bool_masked_pos]) - - q, k, v = qkv[0], 
qkv[1], qkv[2] - - return x, q, k, v - - def forward_intermediate(self, x, bool_masked_pos=None, layer_id=12): - if bool_masked_pos is None: - bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) - x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) - batch_size, seq_len, _ = x.shape - - cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks - mask_token = self.mask_token.expand((batch_size, seq_len, -1)) - - # replace the masked visual tokens by mask_token - w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) - x = x * (1 - w) + mask_token * w - - x = paddle.concat((cls_tokens, x), axis=1) - if self.pos_embed is not None: - x = x + self.pos_embed - x = self.pos_drop(x) - - rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None - if isinstance(layer_id, list): - output_list = [] - for l, blk in enumerate(self.blocks): - x = blk(x, rel_pos_bias=rel_pos_bias) - if l in layer_id: - output_list.append(x[:, 1:]) - return output_list - elif isinstance(layer_id, int): - for l, blk in enumerate(self.blocks): - if l < layer_id: - x = blk(x, rel_pos_bias=rel_pos_bias) - elif l == layer_id: - x = blk.norm1(x) - else: - break - return x[:, 1:] - else: - raise NotImplementedError(f"Not support for layer id is {layer_id} now!") - - def get_last_selfattention(self, x): - x = self.patch_embed(x) - batch_size, seq_len, _ = x.shape - cls_tokens = self.cls_token.expand((batch_size, -1, -1)) - x = paddle.concat((cls_tokens, x), axis=1) - if self.pos_embed is not None: - x = x + self.patch_embed - x = self.pos_drop(x) - rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None - - for i, blk in enumerate(self.blocks): - if i < len(self.blocks) - 1: - x = blk(x, rel_pos_bias=rel_pos_bias) - else: - # return attention of the last block - return blk(x, rel_pos_bias=rel_pos_bias, return_attention=True) - - -class VisionTransformerForMaskedImageModelingCLS(VisionTransformerForMaskedImageModeling): - def __init__(self, img_size=224, patch_size=16, in_chans=3, vocab_size=8192, embed_dim=768, depth=12, - num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., - drop_path_rate=0., norm_layer=None, init_values=None, attn_head_dim=None, - use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, init_std=0.02, - early_layers=6, head_layers=2, shared_lm_head=True): - super().__init__(img_size=img_size, patch_size=patch_size, in_chans=in_chans, vocab_size=vocab_size, embed_dim=embed_dim, depth=depth, - num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, - drop_path_rate=drop_path_rate, norm_layer=norm_layer, init_values=init_values, attn_head_dim=attn_head_dim, - use_abs_pos_emb=use_abs_pos_emb, use_rel_pos_bias=use_rel_pos_bias, use_shared_rel_pos_bias=use_shared_rel_pos_bias, init_std=init_std) - - self.early_layers = early_layers - print(f'early layer {early_layers}, late layer {depth - early_layers}, condenser head layers {head_layers}, shared_lm_head {shared_lm_head}') - - dpr = np.linspace(0, drop_path_rate, max(depth, early_layers + head_layers), dtype=np.float32) - self.cls_pt_layers = nn.LayerList([ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - 
init_values=init_values, - window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None, - attn_head_dim=attn_head_dim) - for i in range(early_layers, early_layers + head_layers) - ]) - self.fix_init_cls_pt_weight() - - self.shared_lm_head = shared_lm_head - if not self.shared_lm_head: - self.cls_pt_norm = norm_layer(embed_dim) - self.cls_pt_lm_head = nn.Linear(embed_dim, vocab_size) - - self.cls_pt_norm.apply(self._init_weights) - self.cls_pt_lm_head.apply(self._init_weights) - - def fix_init_cls_pt_weight(self): - def rescale(param, layer_id): - x = param.divide(paddle.to_tensor([math.sqrt(2.0 * layer_id)])) - param = paddle.create_parameter(shape=x.shape, - dtype=str(x.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(x)) - - for layer_id, layer in enumerate(self.blocks): - rescale(layer.attn.proj.weight, self.early_layers + layer_id + 1) - rescale(layer.mlp.fc2.weight, self.early_layers + layer_id + 1) - - def forward_features(self, x, bool_masked_pos): - x = self.patch_embed(x, bool_masked_pos=bool_masked_pos) - batch_size, seq_len, _ = x.shape - - cls_tokens = self.cls_token.expand((batch_size, -1, -1)) # stole cls_tokens impl from Phil Wang, thanks - mask_token = self.mask_token.expand((batch_size, seq_len, -1)) - - # replace the masked visual tokens by mask_token - w = bool_masked_pos.unsqueeze(-1).astype(mask_token.dtype) - x = x * (1 - w) + mask_token * w - - x = paddle.concat((cls_tokens, x), axis=1) - if self.pos_embed is not None: - x = x + self.pos_embed - x = self.pos_drop(x) - - rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None - for i, blk in enumerate(self.blocks): - x = blk(x, rel_pos_bias=rel_pos_bias) - if i + 1 == self.early_layers: - early_states = x[:, 1:] - - x_cls_pt = paddle.concat((x[:, 0].unsqueeze(1), early_states), axis=1) - for blk in self.cls_pt_layers: - x_cls_pt = blk(x_cls_pt, rel_pos_bias=rel_pos_bias) - - return self.norm(x), self.norm(x_cls_pt) if self.shared_lm_head else self.cls_pt_norm(x_cls_pt) - - def forward(self, x, bool_masked_pos=None, return_all_tokens=False, return_patch_tokens=False): - if bool_masked_pos is None: - bool_masked_pos = paddle.zeros([x.shape[0], self.patch_embed.num_patches], dtype=paddle.bool).set_device(x.device) - x, x_cls_pt = self.forward_features(x, bool_masked_pos=bool_masked_pos) - x = x[:, 1:] - x_cls_pt = x_cls_pt[:, 1:] - if return_patch_tokens: - return [x, x_cls_pt] - if return_all_tokens: - return [self.lm_head(x), self.lm_head(x_cls_pt) if self.shared_lm_head else self.cls_pt_lm_head(x_cls_pt)] - else: - # return the masked tokens - return [self.lm_head(x[bool_masked_pos]), self.lm_head(x_cls_pt[bool_masked_pos]) if self.shared_lm_head else self.cls_pt_lm_head(x_cls_pt[bool_masked_pos])] - - - -def beit_base_patch16_224_8k_vocab_cls_pt(pretrained=False, pretrained_weight=None, **kwargs): - if "num_classes" in kwargs: - _ = kwargs.pop("num_classes") - if 'vocab_size' in kwargs: - vocab_size = kwargs['vocab_size'] - _ = kwargs.pop("vocab_size") - else: - vocab_size = 8192 - model = VisionTransformerForMaskedImageModelingCLS( - patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, qkv_bias=True, - norm_layer="nn.LayerNorm", vocab_size=vocab_size, **kwargs) - if pretrained: - weight = paddle.load(pretrained_weight) - model.set_dict(weight) - model.default_cfg = _cfg() - return model \ No newline at end of file diff --git a/ppcls/arch/backbone/model_zoo/norm_ema_quantizer.py b/ppcls/arch/backbone/model_zoo/norm_ema_quantizer.py deleted file mode 
100644 index 52fe6ac11b..0000000000 --- a/ppcls/arch/backbone/model_zoo/norm_ema_quantizer.py +++ /dev/null @@ -1,242 +0,0 @@ -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -import paddle.distributed as distributed -from einops import rearrange, repeat - -from .modeling_finetune import zeros_, ones_, Identity - -def l2norm(t): - return F.normalize(t, p=2, axis=-1) - -def ema_inplace(moving_avg, new, decay): - x = moving_avg * decay - x = x + new*(1-decay) - moving_avg = paddle.create_parameter(shape=x.shape, - dtype=str(x.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(x)) - -def sample_vectors(samples, num): - num_samples, device = samples.shape[0], samples.device - - if num_samples >= num: - indices = paddle.randperm(num_samples)[:num] - else: - indices = paddle.randint(0, num_samples, [num,]) - - return samples[indices] - -def kmeans(samples, num_clusters, num_iters = 10, use_cosine_sim = False): - dim, dtype, device = samples.shape[-1], samples.dtype, samples.device - - means = sample_vectors(samples, num_clusters) - - for _ in range(num_iters): - if use_cosine_sim: - dists = samples @ means.t() - else: - diffs = rearrange(samples, 'n d -> n () d') \ - - rearrange(means, 'c d -> () c d') - dists = -(diffs ** 2).sum(axis = -1) - - buckets = dists.max(axis = -1).indices - bins = paddle.bincount(buckets, minlength = num_clusters) - zero_mask = bins == 0 - bins_min_clamped = bins.masked_fill(zero_mask, 1) - - new_means = buckets.new_zeros(num_clusters, dim, dtype = dtype) - new_means.scatter_add_(0, repeat(buckets, 'n -> n d', d = dim), samples) - new_means = new_means / bins_min_clamped[..., None] - - if use_cosine_sim: - new_means = l2norm(new_means) - - means = paddle.where(zero_mask[..., None], means, new_means) - - return means, bins - - -class EmbeddingEMA(nn.Layer): - def __init__(self, num_tokens, codebook_dim, decay=0.99, eps=1e-5, kmeans_init=True, codebook_init_path=''): - super().__init__() - self.num_tokens = num_tokens - self.codebook_dim = codebook_dim - self.decay = decay - self.eps = eps - if codebook_init_path == '': - if not kmeans_init: - weight = paddle.randn([num_tokens, codebook_dim]) - weight = l2norm(weight) - else: - weight = paddle.zeros([num_tokens, codebook_dim]) - self.register_buffer('initted', paddle.to_tensor([not kmeans_init], dtype='float32')) - else: - print(f"load init codebook weight from {codebook_init_path}") - codebook_ckpt_weight = paddle.load(codebook_init_path, map_location='cpu') - weight = codebook_ckpt_weight.clone() - self.register_buffer('initted', paddle.to_tensor([True])) - - self.weight = paddle.create_parameter(shape=weight.shape, - dtype=str(weight.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(weight)) - self.cluster_size = self.create_parameter(shape=[num_tokens], default_initializer=zeros_) - self.add_parameter("cluster_size", self.cluster_size) - self.embed_avg = paddle.create_parameter(shape=weight.shape, - dtype=str(weight.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(weight)) - self.update = True - - def init_embed_(self, data): - if self.initted: - return - print("Performing Kemans init for codebook") - embed, cluster_size = kmeans(data, self.num_tokens, 10, use_cosine_sim=True) - self.weight = paddle.create_parameter(shape=embed.shape, - dtype=str(embed.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(embed)) - self.cluster_size = paddle.create_parameter(shape=cluster_size.shape, - dtype=str(cluster_size.numpy().dtype), - 
default_initializer=paddle.nn.initializer.Assign(cluster_size)) - self.initted = paddle.create_parameter(shape=[1], - dtype="bool", - default_initializer=paddle.nn.initializer.Assign(paddle.to_tensor([True]))) - - def forward(self, embed_id): - return F.embedding(embed_id, self.weight) - - def cluster_size_ema_update(self, new_cluster_size): - x = self.cluster_size.multiply(self.decay) - x = x.add(new_cluster_size*(1 - self.decay)) - self.cluster_size = paddle.create_parameter(shape=x.shape, - dtype=str(x.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(x)) - - def embed_avg_ema_update(self, new_embed_avg): - x = self.cluster_size.multiply(self.decay) - x = x.add(new_embed_avg*(1 - self.decay)) - self.embed_avg = paddle.create_parameter(shape=x.shape, - dtype=str(x.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(x)) - - def weight_update(self, num_tokens): - n = self.cluster_size.sum() - smoothed_cluster_size = ( - (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n - ) - #normalize embedding average with smoothed cluster size - embed_normalized = self.embed_avg / smoothed_cluster_size.unsqueeze(1) - # embed_normalized = l2norm(self.embed_avg / smoothed_cluster_size.unsqueeze(1)) - self.weight = paddle.create_parameter(shape=embed_normalized.shape, - dtype=str(embed_normalized.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(embed_normalized)) - - -def norm_ema_inplace(moving_avg, new, decay): - x = moving_avg.multiply(paddle.to_tensor(decay)) - x = x.add(new*(1 - decay)) - x = l2norm(x) - moving_avg = paddle.create_parameter(shape=x.shape, - dtype=str(x.numpy().dtype), - default_initializer=paddle.nn.initializer.Assign(x)) - - - -class NormEMAVectorQuantizer(nn.Layer): - def __init__(self, n_embed, embedding_dim, beta, decay=0.99, eps=1e-5, - statistic_code_usage=True, kmeans_init=False, codebook_init_path=''): - super().__init__() - self.codebook_dim = embedding_dim - self.num_tokens = n_embed - self.beta = beta - self.decay = decay - - # learnable = True if orthogonal_reg_weight > 0 else False - self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay, eps, kmeans_init, codebook_init_path) - - self.statistic_code_usage = statistic_code_usage - if statistic_code_usage: - self.register_buffer('cluster_size', paddle.zeros([n_embed])) - # if distributed.is_available() and distributed.is_initialized(): - # print("ddp is enable, so use ddp_reduce to sync the statistic_code_usage for each gpu!") - # self.all_reduce_fn = distributed.all_reduce - # else: - # self.all_reduce_fn = Identity - # self.all_reduce_fn = paddle.distributed.all_reduce() - - def reset_cluster_size(self, device): - if self.statistic_code_usage: - self.register_buffer('cluster_size', paddle.zeros([self.num_tokens])) - self.cluster_size = self.cluster_size.to(device) - - def _masked_fill(self, x, mask, value): - y = paddle.full(x.shape, value, x.dtype) - return paddle.where(mask, y, x) - - def forward(self, z): - # reshape z -> (batch, height, width, channel) and flatten - #z, 'b c h w -> b h w c' - b, c, h, w = z.shape - z = paddle.reshape(z, [b, h, w, c]) - # z = rearrange(z, 'b c h w -> b h w c') - z = l2norm(z) - z_flattened = z.reshape([-1, self.codebook_dim]) - - self.embedding.init_embed_(z_flattened) - - d = z_flattened.pow(2).sum(axis=1, keepdim=True) + \ - self.embedding.weight.pow(2).sum(axis=1) - 2 * \ - paddle.einsum('bd,nd->bn', z_flattened, self.embedding.weight) # 'n d -> d n' - - encoding_indices = paddle.argmin(d, axis=1) - - 
z_q = self.embedding(encoding_indices).reshape(z.shape) - - encodings = F.one_hot(encoding_indices, self.num_tokens).astype(z.dtype) - - if not self.training: - with paddle.no_grad(): - cluster_size = encodings.sum(0) - # self.all_reduce_fn(cluster_size) - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(cluster_size) - ema_inplace(self.cluster_size, cluster_size, self.decay) - - if self.training and self.embedding.update: - # EMA cluster size - - bins = encodings.sum(0) - # self.all_reduce_fn(bins) - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(bins) - - # self.embedding.cluster_size_ema_update(bins) - ema_inplace(self.cluster_size, bins, self.decay) - - zero_mask = (bins == 0) - # bins = bins.masked_fill(zero_mask, 1.) - bins = self._masked_fill(bins, zero_mask, 1.) - - embed_sum = z_flattened.t() @ encodings - # self.all_reduce_fn(embed_sum) - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(embed_sum) - - embed_normalized = (embed_sum / bins.unsqueeze(0)).t() - embed_normalized = l2norm(embed_normalized) - - embed_normalized = paddle.where(zero_mask[..., None], self.embedding.weight, - embed_normalized) - norm_ema_inplace(self.embedding.weight, embed_normalized, self.decay) - - # compute loss for embedding - loss = self.beta * F.mse_loss(z_q.detach(), z) - - # preserve gradients - z_q = z + (z_q - z).detach() - - # reshape back to match original input shape - #z_q, 'b h w c -> b c h w' - b, h, w, c = z_q.shape - z_q = paddle.reshape(z_q, [b, c, h, w]) - # z_q = rearrange(z_q, 'b h w c -> b c h w') - return z_q, loss, encoding_indices diff --git a/ppcls/arch/backbone/model_zoo/vqkd.py b/ppcls/arch/backbone/model_zoo/vqkd.py index 5a5598f4fc..f387a9fcf7 100644 --- a/ppcls/arch/backbone/model_zoo/vqkd.py +++ b/ppcls/arch/backbone/model_zoo/vqkd.py @@ -19,9 +19,13 @@ import paddle import paddle.nn as nn from paddle.nn.initializer import TruncatedNormal +import paddle.nn.functional as F +import paddle.distributed as distributed +from einops import rearrange, repeat + +from .BeiTV2 import VisionTransformer, zeros_, ones_, Identity + -from .modeling_finetune import VisionTransformer, zeros_, ones_ -from .norm_ema_quantizer import NormEMAVectorQuantizer MODEL_URLS = { "vqkd": @@ -34,6 +38,241 @@ IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) +def l2norm(t): + return F.normalize(t, p=2, axis=-1) + +def ema_inplace(moving_avg, new, decay): + x = moving_avg * decay + x = x + new*(1-decay) + moving_avg = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + +def sample_vectors(samples, num): + num_samples, device = samples.shape[0], samples.device + + if num_samples >= num: + indices = paddle.randperm(num_samples)[:num] + else: + indices = paddle.randint(0, num_samples, [num,]) + + return samples[indices] + +def kmeans(samples, num_clusters, num_iters = 10, use_cosine_sim = False): + dim, dtype, device = samples.shape[-1], samples.dtype, samples.device + + means = sample_vectors(samples, num_clusters) + + for _ in range(num_iters): + if use_cosine_sim: + dists = samples @ means.t() + else: + diffs = rearrange(samples, 'n d -> n () d') \ + - rearrange(means, 'c d -> () c d') + dists = -(diffs ** 2).sum(axis = -1) + + buckets = dists.max(axis = -1).indices + bins = paddle.bincount(buckets, minlength = num_clusters) + zero_mask = bins == 0 + bins_min_clamped = bins.masked_fill(zero_mask, 1) + + 
new_means = buckets.new_zeros(num_clusters, dim, dtype = dtype) + new_means.scatter_add_(0, repeat(buckets, 'n -> n d', d = dim), samples) + new_means = new_means / bins_min_clamped[..., None] + + if use_cosine_sim: + new_means = l2norm(new_means) + + means = paddle.where(zero_mask[..., None], means, new_means) + + return means, bins + + +class EmbeddingEMA(nn.Layer): + def __init__(self, num_tokens, codebook_dim, decay=0.99, eps=1e-5, kmeans_init=True, codebook_init_path=''): + super().__init__() + self.num_tokens = num_tokens + self.codebook_dim = codebook_dim + self.decay = decay + self.eps = eps + if codebook_init_path == '': + if not kmeans_init: + weight = paddle.randn([num_tokens, codebook_dim]) + weight = l2norm(weight) + else: + weight = paddle.zeros([num_tokens, codebook_dim]) + self.register_buffer('initted', paddle.to_tensor([not kmeans_init], dtype='float32')) + else: + print(f"load init codebook weight from {codebook_init_path}") + codebook_ckpt_weight = paddle.load(codebook_init_path, map_location='cpu') + weight = codebook_ckpt_weight.clone() + self.register_buffer('initted', paddle.to_tensor([True])) + + self.weight = paddle.create_parameter(shape=weight.shape, + dtype=str(weight.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(weight)) + self.cluster_size = self.create_parameter(shape=[num_tokens], default_initializer=zeros_) + self.add_parameter("cluster_size", self.cluster_size) + self.embed_avg = paddle.create_parameter(shape=weight.shape, + dtype=str(weight.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(weight)) + self.update = True + + def init_embed_(self, data): + if self.initted: + return + print("Performing Kemans init for codebook") + embed, cluster_size = kmeans(data, self.num_tokens, 10, use_cosine_sim=True) + self.weight = paddle.create_parameter(shape=embed.shape, + dtype=str(embed.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(embed)) + self.cluster_size = paddle.create_parameter(shape=cluster_size.shape, + dtype=str(cluster_size.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(cluster_size)) + self.initted = paddle.create_parameter(shape=[1], + dtype="bool", + default_initializer=paddle.nn.initializer.Assign(paddle.to_tensor([True]))) + + def forward(self, embed_id): + return F.embedding(embed_id, self.weight) + + def cluster_size_ema_update(self, new_cluster_size): + x = self.cluster_size.multiply(self.decay) + x = x.add(new_cluster_size*(1 - self.decay)) + self.cluster_size = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + def embed_avg_ema_update(self, new_embed_avg): + x = self.cluster_size.multiply(self.decay) + x = x.add(new_embed_avg*(1 - self.decay)) + self.embed_avg = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + def weight_update(self, num_tokens): + n = self.cluster_size.sum() + smoothed_cluster_size = ( + (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n + ) + #normalize embedding average with smoothed cluster size + embed_normalized = self.embed_avg / smoothed_cluster_size.unsqueeze(1) + # embed_normalized = l2norm(self.embed_avg / smoothed_cluster_size.unsqueeze(1)) + self.weight = paddle.create_parameter(shape=embed_normalized.shape, + dtype=str(embed_normalized.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(embed_normalized)) + + +def norm_ema_inplace(moving_avg, new, 
decay): + x = moving_avg.multiply(paddle.to_tensor(decay)) + x = x.add(new*(1 - decay)) + x = l2norm(x) + moving_avg = paddle.create_parameter(shape=x.shape, + dtype=str(x.numpy().dtype), + default_initializer=paddle.nn.initializer.Assign(x)) + + + +class NormEMAVectorQuantizer(nn.Layer): + def __init__(self, n_embed, embedding_dim, beta, decay=0.99, eps=1e-5, + statistic_code_usage=True, kmeans_init=False, codebook_init_path=''): + super().__init__() + self.codebook_dim = embedding_dim + self.num_tokens = n_embed + self.beta = beta + self.decay = decay + + # learnable = True if orthogonal_reg_weight > 0 else False + self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay, eps, kmeans_init, codebook_init_path) + + self.statistic_code_usage = statistic_code_usage + if statistic_code_usage: + self.register_buffer('cluster_size', paddle.zeros([n_embed])) + # if distributed.is_available() and distributed.is_initialized(): + # print("ddp is enable, so use ddp_reduce to sync the statistic_code_usage for each gpu!") + # self.all_reduce_fn = distributed.all_reduce + # else: + # self.all_reduce_fn = Identity + # self.all_reduce_fn = paddle.distributed.all_reduce() + + def reset_cluster_size(self, device): + if self.statistic_code_usage: + self.register_buffer('cluster_size', paddle.zeros([self.num_tokens])) + self.cluster_size = self.cluster_size.to(device) + + def _masked_fill(self, x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + def forward(self, z): + # reshape z -> (batch, height, width, channel) and flatten + #z, 'b c h w -> b h w c' + b, c, h, w = z.shape + z = paddle.reshape(z, [b, h, w, c]) + # z = rearrange(z, 'b c h w -> b h w c') + z = l2norm(z) + z_flattened = z.reshape([-1, self.codebook_dim]) + + self.embedding.init_embed_(z_flattened) + + d = z_flattened.pow(2).sum(axis=1, keepdim=True) + \ + self.embedding.weight.pow(2).sum(axis=1) - 2 * \ + paddle.einsum('bd,nd->bn', z_flattened, self.embedding.weight) # 'n d -> d n' + + encoding_indices = paddle.argmin(d, axis=1) + + z_q = self.embedding(encoding_indices).reshape(z.shape) + + encodings = F.one_hot(encoding_indices, self.num_tokens).astype(z.dtype) + + if not self.training: + with paddle.no_grad(): + cluster_size = encodings.sum(0) + # self.all_reduce_fn(cluster_size) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(cluster_size) + ema_inplace(self.cluster_size, cluster_size, self.decay) + + if self.training and self.embedding.update: + # EMA cluster size + + bins = encodings.sum(0) + # self.all_reduce_fn(bins) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(bins) + + # self.embedding.cluster_size_ema_update(bins) + ema_inplace(self.cluster_size, bins, self.decay) + + zero_mask = (bins == 0) + # bins = bins.masked_fill(zero_mask, 1.) + bins = self._masked_fill(bins, zero_mask, 1.) 
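    # A tiny stand-alone sketch of the quantization step implemented by
    # NormEMAVectorQuantizer.forward above: features and codes are L2-normalized,
    # each feature is assigned to its nearest code via the expanded squared distance
    # |z|^2 + |e|^2 - 2 z.e, and gradients reach the encoder through the
    # straight-through trick. Shapes and values here are invented for illustration;
    # the codebook itself is refreshed by the EMA updates in the surrounding code,
    # not by gradient descent.
    import paddle
    import paddle.nn.functional as F

    z_flat = F.normalize(paddle.randn([6, 4]), p=2, axis=-1)      # 6 patch features, dim 4
    codebook = F.normalize(paddle.randn([10, 4]), p=2, axis=-1)   # 10 codebook entries

    d = z_flat.pow(2).sum(axis=1, keepdim=True) \
        + codebook.pow(2).sum(axis=1) \
        - 2 * paddle.einsum('bd,nd->bn', z_flat, codebook)        # [6, 10] distances
    encoding_indices = paddle.argmin(d, axis=1)                   # nearest code per feature
    z_q = F.embedding(encoding_indices, codebook)                 # quantized features, [6, 4]

    # Straight-through estimator: the forward value is z_q while the gradient passes
    # to the encoder output unchanged, mirroring the z + (z_q - z).detach() line above.
    z_q_st = z_flat + (z_q - z_flat).detach()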
+ + embed_sum = z_flattened.t() @ encodings + # self.all_reduce_fn(embed_sum) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce(embed_sum) + + embed_normalized = (embed_sum / bins.unsqueeze(0)).t() + embed_normalized = l2norm(embed_normalized) + + embed_normalized = paddle.where(zero_mask[..., None], self.embedding.weight, + embed_normalized) + norm_ema_inplace(self.embedding.weight, embed_normalized, self.decay) + + # compute loss for embedding + loss = self.beta * F.mse_loss(z_q.detach(), z) + + # preserve gradients + z_q = z + (z_q - z).detach() + + # reshape back to match original input shape + #z_q, 'b h w c -> b c h w' + b, h, w, c = z_q.shape + z_q = paddle.reshape(z_q, [b, c, h, w]) + # z_q = rearrange(z_q, 'b h w c -> b c h w') + return z_q, loss, encoding_indices + class VQKD(nn.Layer): def __init__(self, encoder_config, diff --git a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml index 01c48c7073..5fc486ce23 100644 --- a/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml +++ b/ppcls/configs/ImageNet/BeitV2/BeitV2_base_patch16_224_pt.yaml @@ -5,6 +5,7 @@ Global: output_dir: ./output/ device: gpu save_interval: 1 + train_mode: mask_samples eval_during_train: False eval_interval: 1 epochs: 300 @@ -30,7 +31,6 @@ Arch: name: "Beitv2Model" drop_path_rate : 0.1 class_num: 1000 - is_beitv2: True # if not null, its lengths should be same as models pretrained_list: # if not null, its lengths should be same as models diff --git a/ppcls/engine/engine.py b/ppcls/engine/engine.py index 21d318d8d1..16599db65e 100755 --- a/ppcls/engine/engine.py +++ b/ppcls/engine/engine.py @@ -60,10 +60,6 @@ def __init__(self, config, mode="train"): self.is_rec = True else: self.is_rec = False - if self.config["Arch"].get("is_beitv2", False): - self.is_beitv2 = True - else: - self.is_beitv2 = False # set seed seed = self.config["Global"].get("seed", False) diff --git a/ppcls/engine/train/__init__.py b/ppcls/engine/train/__init__.py index 50bf9037f4..6fce0672d9 100644 --- a/ppcls/engine/train/__init__.py +++ b/ppcls/engine/train/__init__.py @@ -16,3 +16,4 @@ from ppcls.engine.train.train_fixmatch_ccssl import train_epoch_fixmatch_ccssl from ppcls.engine.train.train_progressive import train_epoch_progressive from ppcls.engine.train.train_metabin import train_epoch_metabin +from ppcls.engine.train.train_mask_samples import train_epoch_mask_samples diff --git a/ppcls/engine/train/train.py b/ppcls/engine/train/train.py index 5d638d79c2..7a253706f4 100644 --- a/ppcls/engine/train/train.py +++ b/ppcls/engine/train/train.py @@ -108,9 +108,7 @@ def train_epoch(engine, epoch_id, print_batch_step): def forward(engine, batch): - if not engine.is_rec and not engine.is_beitv2: + if not engine.is_rec: return engine.model(batch[0]) - elif engine.is_rec: - return engine.model(batch[0], batch[1]) else: - return engine.model(batch[0], batch[1], batch[2]) \ No newline at end of file + return engine.model(batch[0], batch[1]) diff --git a/ppcls/engine/train/train_mask_samples.py b/ppcls/engine/train/train_mask_samples.py new file mode 100644 index 0000000000..56c46efc5d --- /dev/null +++ b/ppcls/engine/train/train_mask_samples.py @@ -0,0 +1,110 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function + +import time +import paddle +from ppcls.engine.train.utils import update_loss, update_metric, log_info, type_name +from ppcls.utils import profiler + + +def train_epoch_mask_samples(engine, epoch_id, print_batch_step): + tic = time.time() + + if not hasattr(engine, "train_dataloader_iter"): + engine.train_dataloader_iter = iter(engine.train_dataloader) + + for iter_id in range(engine.iter_per_epoch): + # fetch data batch from dataloader + try: + batch = next(engine.train_dataloader_iter) + except Exception: + # NOTE: reset DALI dataloader manually + if engine.use_dali: + engine.train_dataloader.reset() + engine.train_dataloader_iter = iter(engine.train_dataloader) + batch = next(engine.train_dataloader_iter) + + profiler.add_profiler_step(engine.config["profiler_options"]) + if iter_id == 5: + for key in engine.time_info: + engine.time_info[key].reset() + engine.time_info["reader_cost"].update(time.time() - tic) + + batch_size = batch[0].shape[0] + if not engine.config["Global"].get("use_multilabel", False): + batch[1] = batch[1].reshape([batch_size, -1]) + engine.global_step += 1 + + # image input + if engine.amp: + amp_level = engine.config["AMP"].get("level", "O1").upper() + with paddle.amp.auto_cast( + custom_black_list={ + "flatten_contiguous_range", "greater_than" + }, + level=amp_level): + out = forward(engine, batch) + loss_dict = engine.train_loss_func(out, batch) + else: + out = forward(engine, batch) + loss_dict = engine.train_loss_func(out, batch) + + # loss + loss = loss_dict["loss"] / engine.update_freq + + # backward & step opt + if engine.amp: + scaled = engine.scaler.scale(loss) + scaled.backward() + if (iter_id + 1) % engine.update_freq == 0: + for i in range(len(engine.optimizer)): + engine.scaler.minimize(engine.optimizer[i], scaled) + else: + loss.backward() + if (iter_id + 1) % engine.update_freq == 0: + for i in range(len(engine.optimizer)): + engine.optimizer[i].step() + + if (iter_id + 1) % engine.update_freq == 0: + # clear grad + for i in range(len(engine.optimizer)): + engine.optimizer[i].clear_grad() + # step lr(by step) + for i in range(len(engine.lr_sch)): + if not getattr(engine.lr_sch[i], "by_epoch", False): + engine.lr_sch[i].step() + # update ema + if engine.ema: + engine.model_ema.update(engine.model) + + # below code just for logging + # update metric_for_logger + update_metric(engine, out, batch, batch_size) + # update_loss_for_logger + update_loss(engine, loss_dict, batch_size) + engine.time_info["batch_cost"].update(time.time() - tic) + if iter_id % print_batch_step == 0: + log_info(engine, batch_size, epoch_id, iter_id) + tic = time.time() + + # step lr(by epoch) + for i in range(len(engine.lr_sch)): + if getattr(engine.lr_sch[i], "by_epoch", False) and \ + type_name(engine.lr_sch[i]) != "ReduceOnPlateau": + engine.lr_sch[i].step() + + +def forward(engine, batch): + return engine.model(batch[0], batch[1], batch[2]) \ No newline at end of file From b0012609ab77a67fe8122b388683e9e1688e9846 Mon Sep 17 00:00:00 2001 From: liuxuewen <18435135529@163.com> Date: Mon, 5 Jun 2023 
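The loop above scales each loss by 1 / engine.update_freq and only steps and clears the optimizers every update_freq iterations, so gradients accumulate across iterations and the effective batch size grows accordingly. A stripped-down sketch of that accumulation pattern on its own, with plain SGD, no AMP and invented shapes:

    import paddle
    import paddle.nn.functional as F

    model = paddle.nn.Linear(8, 2)
    opt = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())
    update_freq = 4                        # accumulate gradients over 4 iterations

    for iter_id in range(16):
        x = paddle.randn([4, 8])
        y = paddle.randn([4, 2])
        loss = F.mse_loss(model(x), y) / update_freq
        loss.backward()                    # gradients keep accumulating until cleared
        if (iter_id + 1) % update_freq == 0:
            opt.step()
            opt.clear_grad()               # start the next accumulation window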
17:40:51 +0800 Subject: [PATCH 7/7] feat(BeiTV2): fix a bug that modify key name after checkpoint_model --- ppcls/arch/backbone/model_zoo/BeiTV2.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/ppcls/arch/backbone/model_zoo/BeiTV2.py b/ppcls/arch/backbone/model_zoo/BeiTV2.py index 8a431dec65..5cbfa2317b 100644 --- a/ppcls/arch/backbone/model_zoo/BeiTV2.py +++ b/ppcls/arch/backbone/model_zoo/BeiTV2.py @@ -564,6 +564,16 @@ def beit_base_patch16_224(pretrained=False, finetune_weight=None, model_filter_n else: pass checkpoint_model = new_dict + + student_all_keys = list(checkpoint_model.keys()) + for key in student_all_keys: + if "Teacher" in key: + checkpoint_model.pop(key) + elif "Student" in key: + checkpoint_model[key[8:]] = checkpoint_model.pop(key) + else: + continue + state_dict = model.state_dict() for k in ['head.weight', 'head.bias']: if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape: @@ -581,11 +591,6 @@ def beit_base_patch16_224(pretrained=False, finetune_weight=None, model_filter_n all_keys = list(checkpoint_model.keys()) for key in all_keys: - if "Teacher" in key: - checkpoint_model.pop(key) - elif "Student" in key: - checkpoint_model[key.strip("Student.")] = checkpoint_model.pop(key) - if "relative_position_index" in key: checkpoint_model.pop(key)
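With this change the finetune loader first drops every Teacher entry (the teacher branch of the pretraining checkpoint, which the finetune ViT does not need) and renames the Student entries by slicing off the fixed-length "Student." prefix; len("Student.") is 8, hence key[8:]. The earlier key.strip("Student.") call removed any of the characters S, t, u, d, e, n and . from both ends of a key, so a name such as Student.cls_token came back as cls_tok. A small stand-alone sketch of the corrected remapping, with invented example keys:

    checkpoint_model = {
        "Teacher.encoder.blocks.0.attn.proj.weight": "...",   # teacher weights, dropped
        "Student.blocks.0.attn.proj.weight": "...",           # kept, prefix removed
        "Student.cls_token": "...",
    }

    for key in list(checkpoint_model.keys()):
        if "Teacher" in key:
            checkpoint_model.pop(key)
        elif "Student" in key:
            checkpoint_model[key[8:]] = checkpoint_model.pop(key)   # len("Student.") == 8

    print(sorted(checkpoint_model))   # ['blocks.0.attn.proj.weight', 'cls_token']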