From f15fd928ffe0afc8015c191d5195bf2fb2c99340 Mon Sep 17 00:00:00 2001
From: Ponku
Date: Mon, 1 Aug 2022 15:18:38 +0100
Subject: [PATCH 01/23] Added maxvit architecture and tests

---
 .../ModelTester.test_max_vit_B_224_expect.pkl | Bin 0 -> 939 bytes
 .../ModelTester.test_max_vit_L_224_expect.pkl | Bin 0 -> 939 bytes
 .../ModelTester.test_max_vit_S_224_expect.pkl | Bin 0 -> 939 bytes
 .../ModelTester.test_max_vit_T_224_expect.pkl | Bin 0 -> 939 bytes
 test/test_architecture_ops.py                 |  39 ++
 test/test_models.py                           |   8 +
 torchvision/models/__init__.py                |   1 +
 torchvision/models/maxvit.py                  | 595 ++++++++++++++++++
 8 files changed, 643 insertions(+)
 create mode 100644 test/expect/ModelTester.test_max_vit_B_224_expect.pkl
 create mode 100644 test/expect/ModelTester.test_max_vit_L_224_expect.pkl
 create mode 100644 test/expect/ModelTester.test_max_vit_S_224_expect.pkl
 create mode 100644 test/expect/ModelTester.test_max_vit_T_224_expect.pkl
 create mode 100644 test/test_architecture_ops.py
 create mode 100644 torchvision/models/maxvit.py

diff --git a/test/expect/ModelTester.test_max_vit_B_224_expect.pkl b/test/expect/ModelTester.test_max_vit_B_224_expect.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..2974c870b4b3a2d6de0ca804fc8da17928079350
diff --git a/test/expect/ModelTester.test_max_vit_L_224_expect.pkl b/test/expect/ModelTester.test_max_vit_L_224_expect.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..27777e11ee666c4ab536ab15699ab0f02af679b7
diff --git a/test/expect/ModelTester.test_max_vit_S_224_expect.pkl b/test/expect/ModelTester.test_max_vit_S_224_expect.pkl
new file mode 100644
diff --git a/test/expect/ModelTester.test_max_vit_T_224_expect.pkl b/test/expect/ModelTester.test_max_vit_T_224_expect.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..3db84d2e8e5fb079c737a84b6f8cc3dec9b07f4c
diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py
new file mode 100644
--- /dev/null
+++ b/torchvision/models/maxvit.py
+import math
+from collections import OrderedDict
+from typing import Any, Callable, List, Sequence, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn, Tensor
+
+from ..ops.misc import Conv2dNormActivation, SqueezeExcitation
+from ..ops.stochastic_depth import StochasticDepth
+
+
+def get_relative_position_index(height: int, width: int) -> torch.Tensor:
+    coords = torch.stack(torch.meshgrid([torch.arange(height), torch.arange(width)]))
+    coords_flat = torch.flatten(coords, 1)
+    relative_coords = coords_flat[:, :, None] - coords_flat[:, None, :]
+    relative_coords = relative_coords.permute(1, 2, 0).contiguous()
+    relative_coords[:, :, 0] += height - 1
+    relative_coords[:, :, 1] += width - 1
+    relative_coords[:, :, 0] *= 2 * width - 1
+    return relative_coords.sum(-1)
+
+
+class GeluWrapper(nn.Module):
+    """
+    GELU wrapper to make it compatible with `Conv2dNormActivation`, which passes
+    `inplace=True` to the activation function constructor.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__()
+        self._op = F.gelu
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self._op(x)
+
+
+class MBConv(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        expansion_ratio: float,
+        squeeze_ratio: float,
+        stride: int,
+        activation_fn: Callable[..., nn.Module],
+        normalization_fn: Callable[..., nn.Module],
+    ) -> None:
+        super().__init__()
+
+        proj: Sequence[nn.Module]
+        self.proj: nn.Module
+
+        should_proj = stride != 1 or in_channels != out_channels
+        if should_proj:
+            proj = [nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False)]
+            if stride == 2:
+                proj = [nn.AvgPool2d(kernel_size=3, stride=stride, padding=1)] + proj  # type: ignore
+            self.proj = nn.Sequential(*proj)
+        else:
+            self.proj = nn.Identity()  # type: ignore
+
+        mid_channels = int(out_channels * expansion_ratio)
+        sqz_channels = int(mid_channels * squeeze_ratio)
+
+        _layers = OrderedDict()
+        _layers["pre_norm"] = normalization_fn(in_channels)
+        _layers["conv_a"] = Conv2dNormActivation(
+            in_channels, mid_channels, 1, 1, 0, activation_layer=activation_fn, norm_layer=normalization_fn
+        )
+        _layers["conv_b"] = Conv2dNormActivation(
+            mid_channels,
+            mid_channels,
+            3,
+            stride,
+            1,
+            activation_layer=activation_fn,
+            norm_layer=normalization_fn,
+            groups=mid_channels,
+        )
+        _layers["squeeze_excitation"] = SqueezeExcitation(mid_channels, sqz_channels)
+        _layers["conv_c"] = nn.Conv2d(in_channels=mid_channels, out_channels=out_channels, kernel_size=1, bias=False)
+
+        self.layers = nn.Sequential(_layers)
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.layers(x) + self.proj(x)
+
+
+class RelativePositionalMultiHeadAttention(nn.Module):
+    def __init__(
+        self,
+        feat_dim: int,
+        head_dim: int,
+        max_seq_len: int,
+    ) -> None:
+        super().__init__()
+
+        if feat_dim % head_dim != 0:
+            raise ValueError(f"feat_dim: {feat_dim} must be divisible by head_dim: {head_dim}")
+
+        self.n_heads = feat_dim // head_dim
+        self.head_dim = head_dim
+        self.size = int(math.sqrt(max_seq_len))
+        self.max_seq_len = max_seq_len
+
+        self.to_qkv = nn.Linear(feat_dim, self.n_heads * self.head_dim * 3)
+        self.scale_factor = feat_dim**-0.5
+
+        self.merge = nn.Linear(self.head_dim * self.n_heads, feat_dim)
+        self.positional_bias = nn.parameter.Parameter(
+            torch.zeros(((2 * self.size - 1) * (2 * self.size - 1), self.n_heads), dtype=torch.float32),
+        )
+
+        self.register_buffer(
+            "relative_position_index", get_relative_position_index(self.size, self.size)
+        )
+
+        # initialize the positional bias with a truncated normal distribution
+        self.positional_bias.data.normal_(mean=0, std=0.02)
+
+    def _get_relative_positional_bias(self) -> torch.Tensor:
+        bias_index = self.relative_position_index.view(-1)  # type: ignore
+        relative_bias = self.positional_bias[bias_index].view(self.max_seq_len, self.max_seq_len, -1)  # type: ignore
+        relative_bias = relative_bias.permute(2, 0, 1).contiguous()
+        return relative_bias.unsqueeze(0)
+
+    def forward(self, x: Tensor) -> Tensor:
+        # B: batch, G: number of partitions (groups), P: tokens per partition, D: embedding dim
+        B, G, P, D = x.shape
+        H, DH = self.n_heads, self.head_dim
+
+        qkv = self.to_qkv(x)
+        q, k, v = torch.chunk(qkv, 3, dim=-1)
+
+        q = q.reshape(B, G, P, H, DH).permute(0, 1, 3, 2, 4)
+        k = k.reshape(B, G, P, H, DH).permute(0, 1, 3, 2, 4)
+        v = v.reshape(B, G, P, H, DH).permute(0, 1, 3, 2, 4)
+
+        k = k * self.scale_factor
+        # einsum letters: B batch, G group, H head, I/J token index, D head dim
+        dot_prod = torch.einsum("B G H I D, B G H J D -> B G H I J", q, k)
+        pos_bias = self._get_relative_positional_bias()
+
+        dot_prod = F.softmax(dot_prod + pos_bias, dim=-1)
+
+        out = torch.einsum("B G H I J, B G H J D -> B G H I D", dot_prod, v)
+        out = out.permute(0, 1, 3, 2, 4).reshape(B, G, P, D)
+
+        out = self.merge(out)
+        return out
+
+
+class SwapAxes(nn.Module):
+    def __init__(self, a: int, b: int) -> None:
+        super().__init__()
+        self.a = a
+        self.b = b
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        res = torch.swapaxes(x, self.a, self.b)
+        return res
+
+
+class WindowPartition(nn.Module):
+    """
+    Partitions a tensor of shape [B, C, H, W] into a tensor of
+    shape [B, (H/P)*(W/P), P*P, C], where P is the partition size.
+    """
+
+    def __init__(self, partition_size: int) -> None:
+        super().__init__()
+        self.partition_size = partition_size
+
+    def forward(self, x: Tensor) -> Tensor:
+        B, C, H, W = x.shape
+        P = self.partition_size
+        # chunk up H and W dimensions
+        x = x.reshape(B, C, H // P, P, W // P, P)
+        x = x.permute(0, 2, 4, 3, 5, 1)
+        # collapse the two P tile dimensions into a single P * P token dimension
+        x = x.reshape(B, (H // P) * (W // P), P * P, C)
+        return x
+
+
+class WindowDepartition(nn.Module):
+    """
+    Inverse of WindowPartition: takes a tensor of shape [B, (H/P)*(W/P), P*P, C]
+    and reassembles it into a tensor of shape [B, C, H, W].
+    """
+
+    def __init__(self, partition_size: int, n_partitions: int) -> None:
+        super().__init__()
+        self.partition_size = partition_size
+        self.n_partitions = n_partitions
+
+    def forward(self, x: Tensor) -> Tensor:
+        B, G, PP, C = x.shape
+        P = self.partition_size
+        HP, WP = self.n_partitions, self.n_partitions
+        # split the P * P dimension into two P tile dimensions
+        x = x.reshape(B, HP, WP, P, P, C)
+        # permute into B, C, HP, P, WP, P
+        x = x.permute(0, 5, 1, 3, 2, 4)
+        # reshape into B, C, H, W
+        x = x.reshape(B, C, HP * P, WP * P)
+        return x
+
+
+class MLP(nn.Module):
+    def __init__(
+        self,
+        in_dim: int,
+        hidden_dim: int,
+        activation_fn: Callable[..., nn.Module],
+        normalization_fn: Callable[..., nn.Module],
+        dropout: float,
+    ) -> None:
+        super().__init__()
+        self.in_dim = in_dim
+        self.hidden_dim = hidden_dim
+        self.activation_fn = activation_fn
+        self.normalization_fn = normalization_fn
+        self.dropout = dropout
+
+        self.layers = nn.Sequential(
+            self.normalization_fn(in_dim),
+            nn.Linear(in_dim, hidden_dim),
+            self.activation_fn(),
+            nn.Linear(hidden_dim, in_dim),
+            nn.Dropout(dropout),
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        return x + self.layers(x)
+
+
+class PartitionAttentionLayer(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        head_dim: int,
+        # partitioning parameters
+        partition_size: int,
+        partition_type: str,
+        # the grid size needs to be known at initialization time because
+        # we need to know how many relative offsets there are in the grid
+        grid_size: Tuple[int, int],
+        mlp_ratio: int,
+        activation_fn: Callable[..., nn.Module],
+        normalization_fn: Callable[..., nn.Module],
+        attn_dropout: float,
+        mlp_dropout: float,
+    ) -> None:
+        super().__init__()
+
+        self.n_heads = in_channels // head_dim
+        self.head_dim = head_dim
+        self.n_partitions = grid_size[0] // partition_size
+        self.partition_type = partition_type
+
+        if partition_type not in ["grid", "window"]:
+            raise ValueError("partition_type must be either 'grid' or 'window'")
+
+        if partition_type == "window":
+            p, g = partition_size, self.n_partitions
+        else:
+            p, g = self.n_partitions, partition_size
+
+        partition_op = [WindowPartition(p)]
+        departition_op = [WindowDepartition(p, g)]
+
+        if partition_type == "grid":
+            partition_op = partition_op + [SwapAxes(-2, -3)]  # type: ignore
+            departition_op = [SwapAxes(-2, -3)] + departition_op  # type: ignore
+
+        self.partition_op = nn.Sequential(*partition_op)
+        self.departition_op = nn.Sequential(*departition_op)
+
+        self.attn_layer = nn.Sequential(
+            normalization_fn(in_channels),
+            # the sequence length is always partition_size ** 2 because
+            # of the axis swap in the case of grid partitioning
+            RelativePositionalMultiHeadAttention(in_channels, head_dim, partition_size**2),
+            nn.Dropout(attn_dropout),
+        )
+
+        self.mlp_layer = MLP(in_channels, in_channels * mlp_ratio, activation_fn, normalization_fn, mlp_dropout)
+
+        # layer scale factors
+        self.attn_layer_scale = nn.parameter.Parameter(torch.ones(in_channels) * 1e-6)
+        self.mlp_layer_scale = nn.parameter.Parameter(torch.ones(in_channels) * 1e-6)
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.partition_op(x)
+        x = self.attn_layer(x) * self.attn_layer_scale
+        x = self.mlp_layer(x) * self.mlp_layer_scale
+        x = self.departition_op(x)
+        return x
+
+
+class MaxVitLayer(nn.Module):
+    def __init__(
+        self,
+        # conv parameters
+        in_channels: int,
+        out_channels: int,
+        squeeze_ratio: float,
+        expansion_ratio: float,
+        stride: int,
+        # conv + transformer parameters
+        normalization_fn: Callable[..., nn.Module],
+        activation_fn: Callable[..., nn.Module],
+        # transformer parameters
+        head_dim: int,
+        mlp_ratio: int,
+        mlp_dropout: float,
+        attn_dropout: float,
+        # partitioning parameters
+        partition_size: int,
+        grid_size: Tuple[int, int],
+    ) -> None:
+        super().__init__()
+
+        layers: OrderedDict[str, Any] = OrderedDict()  # type: ignore
+
+        # convolutional layer
+        layers["MBconv"] = MBConv(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            expansion_ratio=expansion_ratio,
+            squeeze_ratio=squeeze_ratio,
+            stride=stride,
+            activation_fn=activation_fn,
+            normalization_fn=normalization_fn,
+        )
+        # attention layers: block (window) attention followed by grid attention
+        layers["window_attention"] = PartitionAttentionLayer(
+            in_channels=out_channels,
+            head_dim=head_dim,
+            partition_size=partition_size,
+            partition_type="window",
+            grid_size=grid_size,
+            mlp_ratio=mlp_ratio,
+            activation_fn=activation_fn,
+            normalization_fn=nn.LayerNorm,
+            attn_dropout=attn_dropout,
+            mlp_dropout=mlp_dropout,
+        )
+        layers["grid_attention"] = PartitionAttentionLayer(
+            in_channels=out_channels,
+            head_dim=head_dim,
+            partition_size=partition_size,
+            partition_type="grid",
+            grid_size=grid_size,
+            mlp_ratio=mlp_ratio,
+            activation_fn=activation_fn,
+            normalization_fn=nn.LayerNorm,
+            attn_dropout=attn_dropout,
+            mlp_dropout=mlp_dropout,
+        )
+        self.layers = nn.Sequential(layers)
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.layers(x)
+
+
+class MaxVitBlock(nn.Module):
+    def __init__(
+        self,
+        # conv parameters
+        in_channels: int,
+        out_channels: int,
+        squeeze_ratio: float,
+        expansion_ratio: float,
+        # conv + transformer parameters
+        normalization_fn: Callable[..., nn.Module],
+        activation_fn: Callable[..., nn.Module],
+        # transformer parameters
+        head_dim: int,
+        mlp_ratio: int,
+        mlp_dropout: float,
+        attn_dropout: float,
+        # partitioning parameters
+        partition_size: int,
+        input_grid_size: Tuple[int, int],
+        # number of layers
+        n_layers: int,
+        p_stochastic: List[float],
+    ) -> None:
+        super().__init__()
+        assert (
+            len(p_stochastic) == n_layers
+        ), f"p_stochastic must have length n_layers={n_layers}, got p_stochastic={p_stochastic}."
+
+        self.layers = nn.ModuleList()
+        # account for the first stride of the first layer
+        self.grid_size = (input_grid_size[0] // 2, input_grid_size[1] // 2)
+
+        for idx, p in enumerate(p_stochastic):
+            stride = 2 if idx == 0 else 1
+            self.layers += [
+                MaxVitLayer(
+                    in_channels=in_channels if idx == 0 else out_channels,
+                    out_channels=out_channels,
+                    squeeze_ratio=squeeze_ratio,
+                    expansion_ratio=expansion_ratio,
+                    stride=stride,
+                    normalization_fn=normalization_fn,
+                    activation_fn=activation_fn,
+                    head_dim=head_dim,
+                    mlp_ratio=mlp_ratio,
+                    mlp_dropout=mlp_dropout,
+                    attn_dropout=attn_dropout,
+                    partition_size=partition_size,
+                    grid_size=self.grid_size,
+                ),
+                StochasticDepth(p, mode="row"),
+            ]
+
+    def forward(self, x: Tensor) -> Tensor:
+        for layer in self.layers:
+            x = layer(x)
+        return x
+
+
+class MaxVit(nn.Module):
+    def __init__(
+        self,
+        # stem and task parameters
+        input_channels: int,
+        stem_channels: int,
+        input_size: Tuple[int, int],
+        out_classes: int,
+        # block parameters
+        block_channels: List[int],
+        block_layers: List[int],
+        stochastic_depth_prob: float,
+        # conv parameters
+        squeeze_ratio: float,
+        expansion_ratio: float,
+        # conv + transformer parameters:
+        # normalization_fn is applied only to the conv layers;
+        # activation_fn is applied to both conv and transformer layers
+        normalization_fn: Callable[..., nn.Module],
+        activation_fn: Callable[..., nn.Module],
+        # transformer parameters
+        head_dim: int,
+        mlp_ratio: int,
+        mlp_dropout: float,
+        attn_dropout: float,
+        # partitioning parameters
+        partition_size: int,
+    ) -> None:
+        super().__init__()
+
+        # stem
+        self.stem = nn.Sequential(
+            Conv2dNormActivation(
+                input_channels, stem_channels, 3, stride=2, norm_layer=None, activation_layer=None, bias=True
+            ),
+            Conv2dNormActivation(
+                stem_channels, stem_channels, 3, stride=1, norm_layer=None, activation_layer=None, bias=True
+            ),
+        )
+
+        # account for stem stride
+        input_size = (input_size[0] // 2, input_size[1] // 2)
+
+        # blocks
+        self.blocks = nn.ModuleList()
+        in_channels = [stem_channels] + block_channels[:-1]
+        out_channels = block_channels
+
+        # precompute the stochastic depth probabilities from 0 to stochastic_depth_prob:
+        # with N blocks of L layers each, we get N * L probabilities linearly spaced
+        # over the range [0, stochastic_depth_prob]
+        p_stochastic = np.linspace(0, stochastic_depth_prob, num=sum(block_layers)).tolist()
+
+        p_idx = 0
+        for in_channel, out_channel, num_layers in zip(in_channels, out_channels, block_layers):
+            self.blocks.append(
+                MaxVitBlock(
+                    in_channels=in_channel,
+                    out_channels=out_channel,
+                    squeeze_ratio=squeeze_ratio,
+                    expansion_ratio=expansion_ratio,
+                    normalization_fn=normalization_fn,
+                    activation_fn=activation_fn,
+                    head_dim=head_dim,
+                    mlp_ratio=mlp_ratio,
+                    mlp_dropout=mlp_dropout,
+                    attn_dropout=attn_dropout,
+                    partition_size=partition_size,
+                    input_grid_size=input_size,
+                    n_layers=num_layers,
+                    p_stochastic=p_stochastic[p_idx : p_idx + num_layers],
+                ),
+            )
+            input_size = self.blocks[-1].grid_size
+            p_idx += num_layers
+
+        self.classifier = nn.Sequential(
+            nn.AdaptiveAvgPool2d(1),
+            nn.Flatten(),
+            nn.Linear(block_channels[-1], out_classes, bias=False),
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        x = self.stem(x)
+        for block in self.blocks:
+            x = block(x)
+        x = self.classifier(x)
+        return x
+
+
+def max_vit_T_224(num_classes: int) -> MaxVit:
+    return MaxVit(
+        input_channels=3,
+        stem_channels=64,
+        input_size=(224, 224),
+        out_classes=num_classes,
+        block_channels=[64, 128, 256, 512],
+        block_layers=[2, 2, 5, 2],
+        stochastic_depth_prob=0.2,
+        squeeze_ratio=0.25,
+        expansion_ratio=4.0,
+        normalization_fn=nn.BatchNorm2d,
+        activation_fn=GeluWrapper,
+        head_dim=32,
+        mlp_ratio=2,
+        mlp_dropout=0.0,
+        attn_dropout=0.0,
+        partition_size=7,
+    )
+
+
+def max_vit_S_224(num_classes: int) -> MaxVit:
+    return MaxVit(
+        input_channels=3,
+        stem_channels=64,
+        input_size=(224, 224),
+        out_classes=num_classes,
+        block_channels=[96, 192, 384, 768],
+        block_layers=[2, 2, 5, 2],
+        stochastic_depth_prob=0.3,
+        squeeze_ratio=0.25,
+        expansion_ratio=4.0,
+        normalization_fn=nn.BatchNorm2d,
+        activation_fn=GeluWrapper,
+        head_dim=32,
+        mlp_ratio=2,
+        mlp_dropout=0.0,
+        attn_dropout=0.0,
+        partition_size=7,
+    )
+
+
+def max_vit_B_224(num_classes: int) -> MaxVit:
+    return MaxVit(
+        input_channels=3,
+        stem_channels=64,
+        input_size=(224, 224),
+        out_classes=num_classes,
+        block_channels=[96, 192, 384, 768],
+        block_layers=[2, 6, 14, 2],
+        stochastic_depth_prob=0.4,
+        squeeze_ratio=0.25,
+        expansion_ratio=4.0,
+        normalization_fn=nn.BatchNorm2d,
+        activation_fn=GeluWrapper,
+        head_dim=32,
+        mlp_ratio=2,
+        mlp_dropout=0.0,
+        attn_dropout=0.0,
+        partition_size=7,
+    )
+
+
+def max_vit_L_224(num_classes: int) -> MaxVit:
+    return MaxVit(
+        input_channels=3,
+        stem_channels=128,
+        input_size=(224, 224),
+        out_classes=num_classes,
+        block_channels=[128, 256, 512, 1024],
+        block_layers=[2, 6, 14, 2],
+        stochastic_depth_prob=0.6,
+        squeeze_ratio=0.25,
+        expansion_ratio=4.0,
+        normalization_fn=nn.BatchNorm2d,
+        activation_fn=GeluWrapper,
+        head_dim=32,
+        mlp_ratio=2,
+        mlp_dropout=0.0,
+        attn_dropout=0.0,
+        partition_size=7,
+    )

From c5b28398cd48d2f3403c7c8eeefbaba9df05fcfe Mon Sep 17 00:00:00 2001
From: Ponku
Date: Fri, 5 Aug 2022 18:16:12 +0100
Subject: [PATCH 02/23] rebased + addressed comments

---
 .../ModelTester.test_alexnet_expect.pkl       | Bin 939 -> 939 bytes
 .../ModelTester.test_convnext_base_expect.pkl | Bin 939 -> 939 bytes
 ...ModelTester.test_convnext_large_expect.pkl | Bin 939 -> 939 bytes
 ...ModelTester.test_convnext_small_expect.pkl | Bin 939 -> 939 bytes
 .../ModelTester.test_convnext_tiny_expect.pkl | Bin 939 -> 939 bytes
 .../ModelTester.test_densenet121_expect.pkl   | Bin 543 -> 939 bytes
 .../ModelTester.test_densenet161_expect.pkl   | Bin 543 -> 939 bytes
 .../ModelTester.test_densenet169_expect.pkl   | Bin 543 -> 939 bytes
 .../ModelTester.test_densenet201_expect.pkl   | Bin 543 -> 939 bytes
 ...odelTester.test_efficientnet_b0_expect.pkl | Bin 939 -> 939 bytes
 ...odelTester.test_efficientnet_b1_expect.pkl | Bin 939 -> 939 bytes
 ...odelTester.test_efficientnet_b2_expect.pkl | Bin 939 -> 939 bytes
...odelTester.test_efficientnet_b3_expect.pkl | Bin 939 -> 939 bytes ...odelTester.test_efficientnet_b4_expect.pkl | Bin 939 -> 939 bytes ...odelTester.test_efficientnet_b5_expect.pkl | Bin 939 -> 939 bytes ...odelTester.test_efficientnet_b6_expect.pkl | Bin 939 -> 939 bytes ...odelTester.test_efficientnet_b7_expect.pkl | Bin 939 -> 939 bytes ...elTester.test_efficientnet_v2_l_expect.pkl | Bin 939 -> 939 bytes ...elTester.test_efficientnet_v2_m_expect.pkl | Bin 939 -> 939 bytes ...elTester.test_efficientnet_v2_s_expect.pkl | Bin 939 -> 939 bytes .../ModelTester.test_inception_v3_expect.pkl | Bin 939 -> 939 bytes .../ModelTester.test_maxvit_t_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_mnasnet0_5_expect.pkl | Bin 543 -> 939 bytes .../ModelTester.test_mnasnet0_75_expect.pkl | Bin 543 -> 939 bytes .../ModelTester.test_mnasnet1_0_expect.pkl | Bin 543 -> 939 bytes .../ModelTester.test_mnasnet1_3_expect.pkl | Bin 543 -> 939 bytes .../ModelTester.test_mobilenet_v2_expect.pkl | Bin 543 -> 939 bytes ...lTester.test_mobilenet_v3_large_expect.pkl | Bin 953 -> 939 bytes ...lTester.test_mobilenet_v3_small_expect.pkl | Bin 953 -> 939 bytes .../ModelTester.test_regnet_x_16gf_expect.pkl | Bin 939 -> 939 bytes ...ModelTester.test_regnet_x_1_6gf_expect.pkl | Bin 939 -> 939 bytes .../ModelTester.test_regnet_x_32gf_expect.pkl | Bin 939 -> 939 bytes ...ModelTester.test_regnet_x_3_2gf_expect.pkl | Bin 939 -> 939 bytes ...ModelTester.test_regnet_x_400mf_expect.pkl | Bin 939 -> 939 bytes ...ModelTester.test_regnet_x_800mf_expect.pkl | Bin 939 -> 939 bytes .../ModelTester.test_regnet_x_8gf_expect.pkl | Bin 939 -> 939 bytes .../ModelTester.test_regnet_y_16gf_expect.pkl | Bin 939 -> 939 bytes ...ModelTester.test_regnet_y_1_6gf_expect.pkl | Bin 939 -> 939 bytes .../ModelTester.test_regnet_y_32gf_expect.pkl | Bin 939 -> 939 bytes ...ModelTester.test_regnet_y_3_2gf_expect.pkl | Bin 939 -> 939 bytes ...ModelTester.test_regnet_y_400mf_expect.pkl | Bin 939 -> 939 bytes ...ModelTester.test_regnet_y_800mf_expect.pkl | Bin 939 -> 939 bytes .../ModelTester.test_regnet_y_8gf_expect.pkl | Bin 939 -> 939 bytes .../ModelTester.test_resnet101_expect.pkl | Bin 543 -> 939 bytes .../ModelTester.test_resnet152_expect.pkl | Bin 543 -> 939 bytes .../ModelTester.test_resnet18_expect.pkl | Bin 543 -> 939 bytes .../ModelTester.test_resnet34_expect.pkl | Bin 543 -> 939 bytes .../ModelTester.test_resnet50_expect.pkl | Bin 543 -> 939 bytes ...delTester.test_resnext101_32x8d_expect.pkl | Bin 939 -> 939 bytes ...delTester.test_resnext101_64x4d_expect.pkl | Bin 939 -> 939 bytes ...odelTester.test_resnext50_32x4d_expect.pkl | Bin 543 -> 939 bytes ...lTester.test_shufflenet_v2_x0_5_expect.pkl | Bin 543 -> 939 bytes ...lTester.test_shufflenet_v2_x1_0_expect.pkl | Bin 543 -> 939 bytes ...lTester.test_shufflenet_v2_x1_5_expect.pkl | Bin 543 -> 939 bytes ...lTester.test_shufflenet_v2_x2_0_expect.pkl | Bin 543 -> 939 bytes .../ModelTester.test_squeezenet1_0_expect.pkl | Bin 543 -> 939 bytes .../ModelTester.test_squeezenet1_1_expect.pkl | Bin 543 -> 939 bytes .../expect/ModelTester.test_swin_b_expect.pkl | Bin 939 -> 939 bytes .../expect/ModelTester.test_swin_s_expect.pkl | Bin 939 -> 939 bytes .../expect/ModelTester.test_swin_t_expect.pkl | Bin 939 -> 939 bytes .../ModelTester.test_vgg11_bn_expect.pkl | Bin 543 -> 939 bytes test/expect/ModelTester.test_vgg11_expect.pkl | Bin 543 -> 939 bytes .../ModelTester.test_vgg13_bn_expect.pkl | Bin 543 -> 939 bytes test/expect/ModelTester.test_vgg13_expect.pkl | Bin 543 -> 939 bytes 
 .../ModelTester.test_vgg16_bn_expect.pkl      | Bin 543 -> 939 bytes
 test/expect/ModelTester.test_vgg16_expect.pkl | Bin 543 -> 939 bytes
 .../ModelTester.test_vgg19_bn_expect.pkl      | Bin 543 -> 939 bytes
 test/expect/ModelTester.test_vgg19_expect.pkl | Bin 543 -> 939 bytes
 ...delTester.test_wide_resnet101_2_expect.pkl | Bin 939 -> 939 bytes
 ...odelTester.test_wide_resnet50_2_expect.pkl | Bin 543 -> 939 bytes
 test/test_models.py                           |   8 -
 torchvision/models/__init__.py                |   2 +-
 torchvision/models/maxvit.py                  | 233 ++++++++----------
 73 files changed, 106 insertions(+), 137 deletions(-)
 create mode 100644 test/expect/ModelTester.test_maxvit_t_expect.pkl
zyZgRn)jhvxfxx_i>KeX-sfoVR0TI99ZM{CgDJi~yC9FT5yiz{%mr%ccNV&aai2}ZG zD$TwGTN}UhUN=7!77sqAYqLITnb*CLmRi1*&<#Jo{U<*#mts-A4-GB9IF8Z26nRZQ zC14!C?yiYHR}kqwTCO&}`vn2NEExp9d>co1WUID$L^^!ht@y@>d`Jg}e zy8FIW)jhwkfxx_M>KeX$sfoVQ0TI91ZM{C$DJi~wC9FUCyiz{wmr%czNV&agi2}ZQ zD$Tz3S{uLjUN=7g77sp|YqLHGn%BLlmRi1+&<#J*{U<**mts-AA`LCSIgZi33wcdH z6JQ*_{H}>VYY^!^bgnkO?gatA9vK9`d>co<&oocJfTvPAXo9gn#LAIB9#l)dtB}(^ gn#Nf_k|dD6P)i30v5x|SlMn*X1hJ0-gOlU}#~y`h-2eap diff --git a/test/expect/ModelTester.test_swin_s_expect.pkl b/test/expect/ModelTester.test_swin_s_expect.pkl index b8b31bab413cdbd970a04a6ae1b8203ac12561dd..89a67e0d61c1676fc902ed8e123e1affe5885599 100644 GIT binary patch delta 230 zcmVJcQrA=a(F3KEaLURuw++_8MWw5S|EskW`Y@OEfFbUuZ?1q7x(ZKRDq%w5#I6v^Jb glwR|`hp(GHP)i30G0b`*lMn*X1ToBdB9r6-$NhM8K>z>% delta 230 zcmVm5k(D{Hr z>k%Zs9oDVB`w@@6fLhPJ+p&DVu&5kApth~P_I7ALW{m#sh9a;m;=nF~Lp4GBL;RqnpL9&J&+!jR9tVA0|}LCJhS znu9q%ZaG80X#3B;>^b+n=p6aIpje~67hRgZhRbulbu|h~KLS8Pv gs08;sj!bL5P)i30Y1*ESlMn*X1Zmoyj+5j9$4O^#jQ{`u delta 230 zcmV${#<4f``3eqei}hI~70Pb(B8A$OOOlYo5NM$CW~m#sh6a;m;)nF~Lt4GBL!RqnpV9&J&+#*ojxgwf(YL&Wc$y)*g5yT@ErNQwOFIR6kVFWaLaSPd^HL`Jc7f%t??tjfbgI_MqVzz g5Ciu-;7n`2P)i30usa|4lMn*X1h6|F_><%U$F)gsX8-^I diff --git a/test/expect/ModelTester.test_vgg11_bn_expect.pkl b/test/expect/ModelTester.test_vgg11_bn_expect.pkl index ba3db474ae24bcdf4cebfb4c7b579767d223b241..f85d5c82ce5f8418f199fba3a7f20518619bcd94 100644 GIT binary patch literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK634~eZf}RciCw(5akjTtWb9S&RnSb{N?C-aFA{}hQ zV!VCN3H4UnI{T|O5qJ0OF31nu?Vy#mx5AyMFEMwC)+W1IX?Ae-8 zX`9?3ZhQRGi#oH-+{v3k`z+ zZ#Fg^s3JLLUAS^kVg>;iy&b}3m<0AD$h&NyyusiJRRGEa0p6@^ATeej1gVFp1ppnX B^&tQN delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu0$tyWL(5&J(+uY~yUzR%Yz2 z@K&(R{=43$;j)JHm$|Ce8JU0gCLHd!+8`5bQ)j$=&u{fs+fVjaZA|X$+0Bz5xOb|F78;^}a{;u6Se5CdUb;R#^e7 zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5=Y0=Zf^kRiCw<-akhS|Gxjd>QLt71zusoS6%A{X`Ks3Ev;XdGIM8qPQa0E| z&UE`8NzGQ{VK$W7Du;iltP>MjI<>i#^`i zm9~ss;h=Qj{wg delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu0${r`=u+wiCOy+sD~o)2lG^|MRNY`eQ==P>X=-xjgIN|J+YdtwtpP2+8nvJXSYmY;O;*< zX?r(#a$9?pY3%J?qhsT+V2Y(f#zvbE={$=)Te2%{ExW{R6~4aMvuvm7-YuWo_uTTX zwrO@U-8?gJNO;`M$ITyS3q;*Nz rbR99=%WJ#OM&Ry0>*py2wmh%S+3xU}y!%_A=$;2> zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5@-GixxI?Eowj@KL|Y$US-$5r<4T)0#dkJoN&L36J9+lJ*l}sk$MwQ{JpR75 zvUdyK^FmY5y5a9N>rM63Y%ZP1wo&-^W{+Ef&mN~Uc{Ujuu6qxJyxFb5ZEjl}HN#rT zyV=IO@6R5;S9A83#w6~2qpGoYm2CT-zwPOJ8|pso4p}H?v--b{ZP5AV-8Xh}+bmgZ zX?--4)%L=!S5_I0DZA$c%I;B^_Rwb2nJqRw6Pot0q&VA%l+4@HGd!5SV~WvNBQz*ul|GAA;)kU|c^H0A=?d~sfS zC=<|D5DxHW1X1ubi5!OlAPE$JooXBrWYCp z0p4tEI#5M&%(`&ppu`LUFnT+L%PH4Pt delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu5PZ6LNc_Z98p4?nYa$TUowG zh-sw_o6dizIRI9r`;D8%Gtd6Z)4kcu6cLPE^Zr^MV8hlGFfeFcD%BxaZK4A6)3xB+O&r@ 
qf6r{ONt@8LCppF0hOuPco@1WLHb$TH_Ef!Dy?2X;uI)3nJ{tgd)0{Z~ diff --git a/test/expect/ModelTester.test_vgg13_expect.pkl b/test/expect/ModelTester.test_vgg13_expect.pkl index ce387c2ffb75cc0ae8e5a299cd663c904c7781bd..b58403a951709bef18d85783eee1d6d9348147e2 100644 GIT binary patch literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5@*9HxxH`fI&BX;h_=4CrhJbb%SxMEm3KC7Dg3rZJv@8f?7p;T?ndD~SN^}X zlJX4RBcLs4ZNPBdy0&qe&4Sa}HnNOw_b4X&?5RATXY)?Wb#F-co84`^=C-$@XILNb zX|^ew_-9Ydn>l+!;uH4@YiR5>k#FC_(Vf1RtLfA3Rg2|p-ZI+Se!bAV`{8bG8-Zn( z*6G=-wgLNISur@L>|PfvyXVb}hc;KwZLz7C)U>BC&Dln#Y~G$z-pMu|U-b4AzgxZc zgqN=EDvmxIP-q1SZrgDJ7*ZgNJGA&2tl^-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu5PTQ*wKy>^p5Q+>5sUu%>*E z1ItRA|0?fnd{X#rZ*}wRvEFlO&*_cAdm@V3?Xuab9HS0Hx(`-`CWZTSS ze7naa*=Nt`^LaKJ+OB&Y!rtsY#%pfd8hVX39{!faOC2fJTc`MIR*)(e*1Q#13S rjn(-rHf58V_RL6gwqYrox5vUK+2-pPy*+E*t=`M&t!w* zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5~uf#pKTaFldZD+f;~IWYucVL+p=fY_7!_R{99mUm3zV3?VIi1Mo*=^$BwJ- zk-21U`!3}6Zqt-oHaA{AwK3S)x=Wa`de1`N)ix>L3~g3>^4pXcWb8S%-PBg0e!u0B z-4@mo!4qwBSMRiuo4>|JYtjsBompyonx7xHY52CmhHw1|n*%pgcArz~*v%npz4wII z{$0*eVS8Fte%h=&|7MSw)#lv}ekxWjo3_~8e*AFHujs{lQg;5cxwZJ*ZWHg9yT6~2 zxBd3xq)pD63>#2rNvv7$`vfqgKp1yu@iSP%L#r&cs2CUvZcgSz1{YGuL72u|Ae%4F zOAloN+6uw}-i#m$o+gpwPyi%>0?<<^x^84Y@uBE^1>_;?);B=ci|i_X6ulBa7sB*H z!yv$$jZFutNRC+-t{jw@K>$W?hj1AtfjtTGE*mIsFnB^0fHFaVH!B-Rj2Q?)>LF?Y DE=2v9 delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu5P(H-5I;_?c{L=Rcc8i_h&|;{9@W@L73Vy`Lv-g4bl&006c(pS%D7 diff --git a/test/expect/ModelTester.test_vgg16_expect.pkl b/test/expect/ModelTester.test_vgg16_expect.pkl index 0e2e4fe0c01a6c2510bdfc87bad2d3fb0ac934ce..f2bde18780ee252c1a6e38f6b56fbb6aa91228d8 100644 GIT binary patch literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5=Y~MpX~-gCfiMl3-**>(zLx{xn<9q*wUOA}x{H~mde6Fm)i%?98rrPz;kQ|4oU!NnE>qiv`TH$7 z_FGuzg-*2nv2Lf$?S*S>(x%R^el|yK5990OHc3A>*yL?GVYA`3%I+`99lOO8toH`^ z@85M+Hf+xfjh{Bfm*4EUV!L_wf&djO{;gYVjz536=WN{KJ;(O`vq@TZZud?9m%H1} z%iDVWIcf7|eTEGvv>4XEyM6)~QXq^wwD=jU;h|NQT2u^-1ve*iB7+MlqT}IKZ;%npbKGo zp-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu0%Z2R~abAtqZDr3HI#UDC9D zWVvOJ#jX{59s<)`ub1Xy(vCQdv~8w-{W@G+*U2}_U<3)w`>&NJ+)EU)w;`p zwR(?J;A)$BKMieW`0(2pm}KnH+ihyAF*|R+<*WS`)@MT}+CEsf(`NCaH8wXU&#=y# ztF}k*^>LePKR4KH+jPR_-W`?Q0V*B4vlXoOKJePVi&ZXckAvnUnwF1Ak81yj`DR0|571pxFQb diff --git a/test/expect/ModelTester.test_vgg19_bn_expect.pkl b/test/expect/ModelTester.test_vgg19_bn_expect.pkl index c20b7a22247bc3cbfc498feb322fadd5b18e662f..e4890867afe31a638255e88aca6862f2cb7f09d4 100644 GIT binary patch literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w 
[GIT binary patch data omitted: remaining regenerated ModelTester.test_*_expect.pkl files, including the vgg19 and wide_resnet variants listed in the revert diffstat below]

diff --git a/test/test_models.py b/test/test_models.py
index d42668cea2f..5ab0640a70e 100644
--- a/test/test_models.py
+++ b/test/test_models.py
@@ -594,14 +594,6 @@ def test_vitc_models(model_fn, dev):
     test_classification_model(model_fn, dev)
 
 
-@pytest.mark.parametrize(
-    "model_fn", [models.max_vit_T_224, models.max_vit_S_224, models.max_vit_B_224, models.max_vit_L_224]
-)
-@pytest.mark.parametrize("dev", cpu_and_gpu())
-def test_max_vit(model_fn, dev):
-    test_classification_model(model_fn, dev)
-
-
 @pytest.mark.parametrize("model_fn", list_model_fns(models))
 @pytest.mark.parametrize("dev", cpu_and_gpu())
 def test_classification_model(model_fn, dev):
diff --git a/torchvision/models/__init__.py b/torchvision/models/__init__.py
index c7a17706bce..838a6011be9 100644
--- a/torchvision/models/__init__.py
+++ b/torchvision/models/__init__.py
@@ -8,11 +8,11 @@
 from .mobilenet import *
 from .regnet import *
 from .resnet import *
+from .maxvit import *
 from .shufflenetv2 import *
 from .squeezenet import *
 from .vgg import *
 from .vision_transformer import *
 from .swin_transformer import *
-from .maxvit import *
 from .
import detection, optical_flow, quantization, segmentation, video from ._api import get_model, get_model_weights, get_weight, list_models diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py index 72a9240c0be..54d0290397f 100644 --- a/torchvision/models/maxvit.py +++ b/torchvision/models/maxvit.py @@ -1,12 +1,15 @@ import math -from typing import Any, Callable, List, OrderedDict, Sequence, Tuple +from typing import Any, Callable, List, Optional, OrderedDict, Sequence, Tuple import numpy as np import torch import torch.nn.functional as F from torch import nn, Tensor +from torchvision.models._api import register_model, WeightsEnum +from torchvision.models._utils import _ovewrite_named_param from torchvision.ops.misc import Conv2dNormActivation, SqueezeExcitation from torchvision.ops.stochastic_depth import StochasticDepth +from torchvision.utils import _log_api_usage_once def get_relative_position_index(height: int, width: int) -> torch.Tensor: @@ -20,20 +23,6 @@ def get_relative_position_index(height: int, width: int) -> torch.Tensor: return relative_coords.sum(-1) -class GeluWrapper(nn.Module): - """ - Gelu wrapper to make it compatible with `ConvNormActivation2D` which passed inplace=True - to the activation function construction. - """ - - def __init__(self, **kwargs) -> None: - super().__init__() - self._op = F.gelu - - def forward(self, x: Tensor) -> Tensor: - return self._op(x) - - class MBConv(nn.Module): def __init__( self, @@ -65,20 +54,28 @@ def __init__( _layers = OrderedDict() _layers["pre_norm"] = normalization_fn(in_channels) _layers["conv_a"] = Conv2dNormActivation( - in_channels, mid_channels, 1, 1, 0, activation_layer=activation_fn, norm_layer=normalization_fn + in_channels, + mid_channels, + kernel_size=1, + stride=1, + padding=0, + activation_layer=activation_fn, + norm_layer=normalization_fn, + inplace=None, ) _layers["conv_b"] = Conv2dNormActivation( mid_channels, mid_channels, - 3, - stride, - 1, + kernel_size=3, + stride=stride, + padding=1, activation_layer=activation_fn, norm_layer=normalization_fn, groups=mid_channels, + inplace=None, ) _layers["squeeze_excitation"] = SqueezeExcitation(mid_channels, sqz_channels) - _layers["conv_c"] = nn.Conv2d(in_channels=mid_channels, out_channels=out_channels, kernel_size=1, bias=False) + _layers["conv_c"] = nn.Conv2d(in_channels=mid_channels, out_channels=out_channels, kernel_size=1, bias=True) self.layers = nn.Sequential(_layers) @@ -116,14 +113,13 @@ def __init__( # initialize with truncated normal the bias self.positional_bias.data.normal_(mean=0, std=0.02) - def _get_relative_positional_bias(self) -> torch.Tensor: + def get_relative_positional_bias(self) -> torch.Tensor: bias_index = self.relative_position_index.view(-1) # type: ignore relative_bias = self.positional_bias[bias_index].view(self.max_seq_len, self.max_seq_len, -1) # type: ignore relative_bias = relative_bias.permute(2, 0, 1).contiguous() return relative_bias.unsqueeze(0) def forward(self, x: Tensor) -> Tensor: - # X, Y and stand for X-axis group dim, Y-axis group dim B, G, P, D = x.shape H, DH = self.n_heads, self.head_dim @@ -135,9 +131,8 @@ def forward(self, x: Tensor) -> Tensor: v = v.reshape(B, G, P, H, DH).permute(0, 1, 3, 2, 4) k = k * self.scale_factor - # X, Y and stand for X-axis group dim, Y-axis group dim dot_prod = torch.einsum("B G H I D, B G H J D -> B G H I J", q, k) - pos_bias = self._get_relative_positional_bias() + pos_bias = self.get_relative_positional_bias() dot_prod = F.softmax(dot_prod + pos_bias, dim=-1) @@ -204,34 
+199,6 @@ def forward(self, x: Tensor) -> Tensor: return x -class MLP(nn.Module): - def __init__( - self, - in_dim: int, - hidden_dim: int, - activation_fn: Callable[..., nn.Module], - normalization_fn: Callable[..., nn.Module], - dropout: float, - ) -> None: - super().__init__() - self.in_dim = in_dim - self.hidden_dim = hidden_dim - self.activation_fn = activation_fn - self.normalization_fn = normalization_fn - self.dropout = dropout - - self.layers = nn.Sequential( - self.normalization_fn(in_dim), - nn.Linear(in_dim, hidden_dim), - self.activation_fn(), - nn.Linear(hidden_dim, in_dim), - nn.Dropout(dropout), - ) - - def forward(self, x: Tensor) -> Tensor: - return x + self.layers(x) - - class PartitionAttentionLayer(nn.Module): def __init__( self, @@ -282,7 +249,14 @@ def __init__( nn.Dropout(attn_dropout), ) - self.mlp_layer = MLP(in_channels, in_channels * mlp_ratio, activation_fn, normalization_fn, mlp_dropout) + # pre-normalization similar to transformer layers + self.mlp_layer = nn.Sequential( + nn.LayerNorm(in_channels), + nn.Linear(in_channels, in_channels * mlp_ratio), + activation_fn(), + nn.Linear(in_channels * mlp_ratio, in_channels), + nn.Dropout(mlp_dropout), + ) # layer scale factors self.attn_layer_scale = nn.parameter.Parameter(torch.ones(in_channels) * 1e-6) @@ -290,8 +264,8 @@ def __init__( def forward(self, x: Tensor) -> Tensor: x = self.partition_op(x) - x = self.attn_layer(x) * self.attn_layer_scale - x = self.mlp_layer(x) * self.mlp_layer_scale + x = x + self.attn_layer(x) * self.attn_layer_scale + x = x + self.mlp_layer(x) * self.mlp_layer_scale x = self.departition_op(x) return x @@ -386,9 +360,8 @@ def __init__( p_stochastic: List[float], ) -> None: super().__init__() - assert ( - len(p_stochastic) == n_layers - ), f"p_stochastic must have length n_layers={n_layers}, got p_stochastic={p_stochastic}." 
+ if not len(p_stochastic) == n_layers: + raise ValueError(f"p_stochastic must have length n_layers={n_layers}, got p_stochastic={p_stochastic}.") self.layers = nn.ModuleList() # account for the first stride of the first layer @@ -424,11 +397,12 @@ def forward(self, x: Tensor) -> Tensor: class MaxVit(nn.Module): def __init__( self, + # input size parameters + input_size: Tuple[int, int], # stem and task parameters input_channels: int, stem_channels: int, - input_size: Tuple[int, int], - out_classes: int, + num_classes: int, # block parameters block_channels: List[int], block_layers: List[int], @@ -450,6 +424,7 @@ def __init__( partition_size: int, ) -> None: super().__init__() + _log_api_usage_once(self) # stem self.stem = nn.Sequential( @@ -500,7 +475,7 @@ def __init__( self.classifier = nn.Sequential( nn.AdaptiveAvgPool2d(1), nn.Flatten(), - nn.Linear(block_channels[-1], out_classes, bias=False), + nn.Linear(block_channels[-1], num_classes, bias=False), ) def forward(self, x: Tensor) -> Tensor: @@ -511,85 +486,87 @@ def forward(self, x: Tensor) -> Tensor: return x -def max_vit_T_224(num_classes: int) -> MaxVit: - return MaxVit( - input_channels=3, - stem_channels=64, - input_size=(224, 224), - out_classes=num_classes, - block_channels=[64, 128, 256, 512], - block_layers=[2, 2, 5, 2], - stochastic_depth_prob=0.2, - squeeze_ratio=0.25, - expansion_ratio=4.0, - normalization_fn=nn.BatchNorm2d, - activation_fn=GeluWrapper, - head_dim=32, - mlp_ratio=2, - mlp_dropout=0.0, - attn_dropout=0.0, - partition_size=7, +def _maxvit( + # stem and task parameters + stem_channels: int, + num_classes: int, + # block parameters + block_channels: List[int], + block_layers: List[int], + stochastic_depth_prob: float, + # conv parameters + squeeze_ratio: float, + expansion_ratio: float, + # conv + transformer parameters + # normalization_fn is applied only to the conv layers + # activation_fn is applied both to conv and transformer layers + normalization_fn: Callable[..., nn.Module], + activation_fn: Callable[..., nn.Module], + # transformer parameters + head_dim: int, + mlp_ratio: int, + mlp_dropout: float, + attn_dropout: float, + # partitioning parameters + partition_size: int, + # Weights API + weights: Optional[WeightsEnum], + progress: bool, + # kwargs, + **kwargs, +) -> MaxVit: + if weights is not None: + _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"])) + assert weights.meta["min_size"][0] == weights.meta["min_size"][1] + _ovewrite_named_param(kwargs, "input_size", weights.meta["min_size"][0]) + _ovewrite_named_param(kwargs, "input_channels", weights.meta["input_channels"]) + + input_size = kwargs.pop("input_size", (224, 224)) + input_channels = kwargs.pop("input_channels", 3) + + model = MaxVit( + input_channels=input_channels, + stem_channels=stem_channels, + num_classes=num_classes, + block_channels=block_channels, + block_layers=block_layers, + stochastic_depth_prob=stochastic_depth_prob, + squeeze_ratio=squeeze_ratio, + expansion_ratio=expansion_ratio, + normalization_fn=normalization_fn, + activation_fn=activation_fn, + head_dim=head_dim, + mlp_ratio=mlp_ratio, + mlp_dropout=mlp_dropout, + attn_dropout=attn_dropout, + partition_size=partition_size, + input_size=input_size, + **kwargs, ) + if weights is not None: + model.load_state_dict(weights.get_state_dict(progress=progress)) -def max_vit_S_224(num_classes: int) -> MaxVit: - return MaxVit( - input_channels=3, - stem_channels=64, - input_size=(224, 224), - out_classes=num_classes, - block_channels=[96, 192, 384, 
768],
-        block_layers=[2, 2, 5, 2],
-        stochastic_depth_prob=0.3,
-        squeeze_ratio=0.25,
-        expansion_ratio=4.0,
-        normalization_fn=nn.BatchNorm2d,
-        activation_fn=GeluWrapper,
-        head_dim=32,
-        mlp_ratio=2,
-        mlp_dropout=0.0,
-        attn_dropout=0.0,
-        partition_size=7,
-    )
+    return model
 
 
-def max_vit_B_224(num_classes: int) -> MaxVit:
-    return MaxVit(
-        input_channels=3,
+@register_model(name="maxvit_t")
+def maxvit_t(*, weights: Optional[WeightsEnum] = None, progress: bool = True, **kwargs: Any) -> MaxVit:
+    return _maxvit(
         stem_channels=64,
-        input_size=(224, 224),
-        out_classes=num_classes,
-        block_channels=[96, 192, 384, 768],
-        block_layers=[2, 6, 14, 2],
-        stochastic_depth_prob=0.4,
-        squeeze_ratio=0.25,
-        expansion_ratio=4.0,
-        normalization_fn=nn.BatchNorm2d,
-        activation_fn=GeluWrapper,
-        head_dim=32,
-        mlp_ratio=2,
-        mlp_dropout=0.0,
-        attn_dropout=0.0,
-        partition_size=7,
-    )
-
-
-def max_vit_L_224(num_classes: int) -> MaxVit:
-    return MaxVit(
-        input_channels=3,
-        stem_channels=128,
-        input_size=(224, 224),
-        out_classes=num_classes,
-        block_channels=[128, 256, 512, 1024],
-        block_layers=[2, 6, 14, 2],
-        stochastic_depth_prob=0.6,
+        block_channels=[64, 128, 256, 512],
+        block_layers=[2, 2, 5, 2],
+        stochastic_depth_prob=0.2,
         squeeze_ratio=0.25,
         expansion_ratio=4.0,
         normalization_fn=nn.BatchNorm2d,
-        activation_fn=GeluWrapper,
+        activation_fn=nn.GELU,
         head_dim=32,
         mlp_ratio=2,
         mlp_dropout=0.0,
         attn_dropout=0.0,
         partition_size=7,
+        weights=weights,
+        progress=progress,
+        **kwargs,
     )
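
After this commit, the tiny variant is built through the registered maxvit_t entry point above. A minimal usage sketch, not part of the patch, assuming a 224x224 RGB input and passing weights=None, since this revision registers the architecture without shipping a pretrained WeightsEnum:

    import torch
    from torchvision.models.maxvit import maxvit_t

    # Build the tiny MaxViT with random weights; num_classes is accepted via
    # **kwargs and handed through _maxvit to the MaxVit constructor.
    model = maxvit_t(weights=None, num_classes=1000)
    model.eval()

    with torch.no_grad():
        # The builder defaults input_size to (224, 224), matching the stem.
        logits = model(torch.randn(1, 3, 224, 224))
    print(logits.shape)  # torch.Size([1, 1000])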
From 5e8a2228b3a663620e15863d79f95cec6e9600af Mon Sep 17 00:00:00 2001
From: Ponku
Date: Fri, 5 Aug 2022 20:07:46 +0100
Subject: [PATCH 03/23] Revert "rebased + addresed comments"

This reverts commit c5b28398cd48d2f3403c7c8eeefbaba9df05fcfe.
---
 .../ModelTester.test_alexnet_expect.pkl       | Bin 939 -> 939 bytes
 .../ModelTester.test_convnext_base_expect.pkl | Bin 939 -> 939 bytes
 ...ModelTester.test_convnext_large_expect.pkl | Bin 939 -> 939 bytes
 ...ModelTester.test_convnext_small_expect.pkl | Bin 939 -> 939 bytes
 .../ModelTester.test_convnext_tiny_expect.pkl | Bin 939 -> 939 bytes
 .../ModelTester.test_densenet121_expect.pkl   | Bin 939 -> 543 bytes
 .../ModelTester.test_densenet161_expect.pkl   | Bin 939 -> 543 bytes
 .../ModelTester.test_densenet169_expect.pkl   | Bin 939 -> 543 bytes
 .../ModelTester.test_densenet201_expect.pkl   | Bin 939 -> 543 bytes
 ...odelTester.test_efficientnet_b0_expect.pkl | Bin 939 -> 939 bytes
 ...odelTester.test_efficientnet_b1_expect.pkl | Bin 939 -> 939 bytes
 ...odelTester.test_efficientnet_b2_expect.pkl | Bin 939 -> 939 bytes
 ...odelTester.test_efficientnet_b3_expect.pkl | Bin 939 -> 939 bytes
 ...odelTester.test_efficientnet_b4_expect.pkl | Bin 939 -> 939 bytes
 ...odelTester.test_efficientnet_b5_expect.pkl | Bin 939 -> 939 bytes
 ...odelTester.test_efficientnet_b6_expect.pkl | Bin 939 -> 939 bytes
 ...odelTester.test_efficientnet_b7_expect.pkl | Bin 939 -> 939 bytes
 ...elTester.test_efficientnet_v2_l_expect.pkl | Bin 939 -> 939 bytes
 ...elTester.test_efficientnet_v2_m_expect.pkl | Bin 939 -> 939 bytes
 ...elTester.test_efficientnet_v2_s_expect.pkl | Bin 939 -> 939 bytes
 .../ModelTester.test_inception_v3_expect.pkl  | Bin 939 -> 939 bytes
 .../ModelTester.test_maxvit_t_expect.pkl      | Bin 939 -> 0 bytes
 .../ModelTester.test_mnasnet0_5_expect.pkl    | Bin 939 -> 543 bytes
 .../ModelTester.test_mnasnet0_75_expect.pkl   | Bin 939 -> 543 bytes
 .../ModelTester.test_mnasnet1_0_expect.pkl    | Bin 939 -> 543 bytes
 .../ModelTester.test_mnasnet1_3_expect.pkl    | Bin 939 -> 543 bytes
 .../ModelTester.test_mobilenet_v2_expect.pkl  | Bin 939 -> 543 bytes
 ...lTester.test_mobilenet_v3_large_expect.pkl | Bin 939 -> 953 bytes
 ...lTester.test_mobilenet_v3_small_expect.pkl | Bin 939 -> 953 bytes
 .../ModelTester.test_regnet_x_16gf_expect.pkl | Bin 939 -> 939 bytes
 ...ModelTester.test_regnet_x_1_6gf_expect.pkl | Bin 939 -> 939 bytes
 .../ModelTester.test_regnet_x_32gf_expect.pkl | Bin 939 -> 939 bytes
 ...ModelTester.test_regnet_x_3_2gf_expect.pkl | Bin 939 -> 939 bytes
 ...ModelTester.test_regnet_x_400mf_expect.pkl | Bin 939 -> 939 bytes
 ...ModelTester.test_regnet_x_800mf_expect.pkl | Bin 939 -> 939 bytes
 .../ModelTester.test_regnet_x_8gf_expect.pkl  | Bin 939 -> 939 bytes
 .../ModelTester.test_regnet_y_16gf_expect.pkl | Bin 939 -> 939 bytes
 ...ModelTester.test_regnet_y_1_6gf_expect.pkl | Bin 939 -> 939 bytes
 .../ModelTester.test_regnet_y_32gf_expect.pkl | Bin 939 -> 939 bytes
 ...ModelTester.test_regnet_y_3_2gf_expect.pkl | Bin 939 -> 939 bytes
 ...ModelTester.test_regnet_y_400mf_expect.pkl | Bin 939 -> 939 bytes
 ...ModelTester.test_regnet_y_800mf_expect.pkl | Bin 939 -> 939 bytes
 .../ModelTester.test_regnet_y_8gf_expect.pkl  | Bin 939 -> 939 bytes
 .../ModelTester.test_resnet101_expect.pkl     | Bin 939 -> 543 bytes
 .../ModelTester.test_resnet152_expect.pkl     | Bin 939 -> 543 bytes
 .../ModelTester.test_resnet18_expect.pkl      | Bin 939 -> 543 bytes
 .../ModelTester.test_resnet34_expect.pkl      | Bin 939 -> 543 bytes
 .../ModelTester.test_resnet50_expect.pkl      | Bin 939 -> 543 bytes
 ...delTester.test_resnext101_32x8d_expect.pkl | Bin 939 -> 939 bytes
 ...delTester.test_resnext101_64x4d_expect.pkl | Bin 939 -> 939 bytes
 ...odelTester.test_resnext50_32x4d_expect.pkl | Bin 939 -> 543 bytes
 ...lTester.test_shufflenet_v2_x0_5_expect.pkl | Bin 939 -> 543 bytes
 ...lTester.test_shufflenet_v2_x1_0_expect.pkl | Bin 939 -> 543 bytes
 ...lTester.test_shufflenet_v2_x1_5_expect.pkl | Bin 939 -> 543 bytes
 ...lTester.test_shufflenet_v2_x2_0_expect.pkl | Bin 939 -> 543 bytes
 .../ModelTester.test_squeezenet1_0_expect.pkl | Bin 939 -> 543 bytes
 .../ModelTester.test_squeezenet1_1_expect.pkl | Bin 939 -> 543 bytes
 .../expect/ModelTester.test_swin_b_expect.pkl | Bin 939 -> 939 bytes
 .../expect/ModelTester.test_swin_s_expect.pkl | Bin 939 -> 939 bytes
 .../expect/ModelTester.test_swin_t_expect.pkl | Bin 939 -> 939 bytes
 .../ModelTester.test_vgg11_bn_expect.pkl      | Bin 939 -> 543 bytes
 test/expect/ModelTester.test_vgg11_expect.pkl | Bin 939 -> 543 bytes
 .../ModelTester.test_vgg13_bn_expect.pkl      | Bin 939 -> 543 bytes
 test/expect/ModelTester.test_vgg13_expect.pkl | Bin 939 -> 543 bytes
 .../ModelTester.test_vgg16_bn_expect.pkl      | Bin 939 -> 543 bytes
 test/expect/ModelTester.test_vgg16_expect.pkl | Bin 939 -> 543 bytes
 .../ModelTester.test_vgg19_bn_expect.pkl      | Bin 939 -> 543 bytes
 test/expect/ModelTester.test_vgg19_expect.pkl | Bin 939 -> 543 bytes
 ...delTester.test_wide_resnet101_2_expect.pkl | Bin 939 -> 939 bytes
 ...odelTester.test_wide_resnet50_2_expect.pkl | Bin 939 -> 543 bytes
 test/test_models.py                           |   8 +
 torchvision/models/__init__.py                |   2 +-
 torchvision/models/maxvit.py                  | 233 ++++++++++--------
 73 files changed, 137 insertions(+), 106 deletions(-)
 delete mode 100644 test/expect/ModelTester.test_maxvit_t_expect.pkl

[GIT binary patch data omitted: per-file deltas restoring the expectation pickles listed above and deleting test/expect/ModelTester.test_maxvit_t_expect.pkl]
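
A note on the registry plumbing, as a sketch under assumptions rather than part of the patch: the @register_model decorator from the previous commit, together with the list_models and get_model helpers imported in torchvision/models/__init__.py above, is what makes the new builder resolvable by name. Right after the previous commit the check below would print True; after this revert it prints False again:

    import torchvision.models as models

    # list_models() and get_model() come from torchvision.models._api,
    # as imported in the __init__.py hunk shown earlier.
    print("maxvit_t" in models.list_models())

    # Resolving by name only works while the maxvit builder is registered;
    # after this revert the lookup fails.
    model = models.get_model("maxvit_t", weights=None)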
z=p{QYJtMpk7>+!^rI0$U>BT$qu4p_|o{GG|1Nysr`lq{45I9l1QW?a%?;6xQRw&QB z%byaw6dNVGL>d0OVMaqdT4=nx2<51{kY6ae{<)<)h=*#s5Lp4bh$1(;T*ME&ek2RL g8BJ9>9AJPvP)i30(anmoLb z7OcFaWLvtB28KAKgblo7_#(W(j1D{!PBFZ13+%gS>~}jcS$aF1s4Kk8?jk&J(4aYp z=p{QZJtMpk7>+!&rI0$P>BT$uu4p_{o{GH81Nysm`lq{85I9l1QW?a%@fy@SRVdHA z%byaw8XG0MJ{kVIVMaqdU1+?!0OhEOV delta 230 zcmVW;9!PHC9Gpz`%S2PLY%kUYV?ubyu|q)YU?3+^Dl4Stip_=^HR99!_c gM*s=GxSnUou7|%KE;K*% z#7aNZZ3w?=W*9)g3-CQH((*na;m5p`-l4sSYvjKx)gM2AP8hqt<59nvDuF*BoPs`= z(Wt*PfQ~;Yo58;v71chviKD+p99lov!5cmz6b`@pg_A$<4%tyXrZE7&SCnc$nzgOJ z%C=ZO(wi+m!0)m=X^XbMUO@vtBm~1gQ$Pqn2r{{~$lK{+7Q5K2N_Mt|PyOmL0w-pJ6|ECE-4xye_@8J?Fi2 z)h@q)X`H|PU)??;9m+j$o6bJ{&ij=v_U>tt`J1GrZ^{~$lI{+7Q6K2N_Mt|PyDmL0w+pJ6|GCE-5Mye_@KJ?Fi4 z)h@q%X`H_WVBJ0)9m+j&o6bJ_&i@=v_U^tt`I~GrY?!`1jSUb!^_Z1D zu+XVKmM6kK7Cgj0OqWML7_L6PPmUD7ilL@Hj$*dI$*D#^0HXrC0B~hHd}9o~Jl2jr cUg&E+P)i30Ms^sMlMn*X1V(lkmXqWH$DeU(oB#j- diff --git a/test/expect/ModelTester.test_convnext_tiny_expect.pkl b/test/expect/ModelTester.test_convnext_tiny_expect.pkl index cbfc82b303d2f97da7c7798e5c5ad6405c6d3509..c6fb873f12f17656ab2f10e83328b29a0a7807aa 100644 GIT binary patch delta 230 zcmVrd_hz!35A1A)+t-wCb|0g~$>K47gJ1;&WWnsVcXD&ZB z4=%o#RSG|-Qx3i+m$kjg;Ws~!RpUOq+Zw*PnN>e}r`0{UiG95qCvU%e$~wQ&rKY~) zP2WBM;ZQy}XN*40`+&cq4o*J@pn*Srs3kuB)(<~t<8 delta 230 zcmVv>rd^hz!38A1A)>t-wCU|0g~)>K47nJ1;&XWnsVcXD&Z1 z4=%oyRSG|#Qx3lPmbJai;Ws~!RpUOy+Zw*OnN>fKr`0`piG95mCvU%f$~wQyrKY~! zP2WBe;ZQy}XN*40`+&cs4o*J@pn*Svs3kuB)(<~=hJn73k%Cdbt>)^z?3?qw(%c}v zNR=o**`-3iMmKc66nf7+US>i+CO#!TnBIoI$sn`7&S2j@*uM-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu0%|$1b}V<%)eL_6P30xMTa? zRZ`n+=WY|S+qXpC{z=&ty9^Vjee5CU?fj(5_Z|Aac3;%jQ~P#TU$tAh{r*1PI~Vpg zG&Ah2FYLEH@uqU`8=n>X9N3liPSaG|w`+19-#(Qu!FD10&GrSnx3FtkcVzFa*=+mX ztM1)*<9ML$ylxe{36dgv{EsZ$Td-Qo&eQyd?Kk0EyKCGN_idTmWLNa8c^_Y=s@)I1 sP5UZtC++i5f4o=e zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5-09smtBN%#lD{XfqS>^*uK|GYP)USHX*x3OXTg(lwGk4GI84XAo#qUo>ck1 zHQ(3nGyZyN-^A*xc3s==@B4N8!rqc*hP@$${kBnWD)&C{S+Va4o6_DcO|^Y3llk_2 z_#ABKyWeb|$a@Pr({)Goo}10K@0#k~eG*gcRG+2ed<;ok7oT6U`DKWuLZ z=i2S#p15!M+$OuIXU+Rwb*S2%;oGz?`gYR3A8L>H{yOn^U)f_@yPrOH?7FpG?Vj9} z*jF#xvTqh|*gjBbHJsYD0?<<^x^84Y@uBE^1>_;?);B=ci|i_X6ulBa7sB*H z!yv$$jZFutNRC+-t{jw@K>$W?hj1AtfjtTGE*mIsFnB^0fHFaVH!B-Rj2Q?)>LF?Y D3lRaJ diff --git a/test/expect/ModelTester.test_densenet161_expect.pkl b/test/expect/ModelTester.test_densenet161_expect.pkl index c72d10d0a403d085d355511889ea914e0fd7560e..9b221957014f8b593b004202dafd8861c2285c3a 100644 GIT binary patch delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu5Pl*URj5UHa?__te_$c^0=X zY%9aQU7l<9Sw$$=T@qWdZ&~M~y+=ZdZ4d4#*(Y|E&n{oscmI_YuWSQmf3$gbuzg>z zO0Au@K&M^8y?r+K8$0cOmm1nNWpeD>b|LSM-Ms_8`#i;J_BypS@2hz>#ZKto=6zx= zZMJuKne0j?X4_v04zRO`PqRJqnsYx_#%Vh%F^zrerg!culMvt6BDi4R&9p^!kHqTj r?)A=xZs}_ literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK66e3p}L7TK*7tGAo**Uom?`I~z$y*_W3w8P6*rCnuT#QW7YCEc9+ zt~?FgpYVWbKPa^R2u!$m0vJ*tj61aW8LZ)pd1Mi2#0lgM!>0Fpof=qVIkH?p7jP;|Zm@{o1w8=&h&b`?L0UJ0NJVS1rq z5a7+mrUO+Z$E*ui4ob`*0He1-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go 
zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu32c>)m~~mI~X4omAQ95o5hC z#$t<|fWry90-o*rQde5<+u_z|_sDMAK8ACAcK6S1+^4E>Z121pHd~FoPW!g1PuZ8l zvU2Z&-ShWN>{+-sBvsrt$tTE8Hs_fg17}{zzAx8r+Np<$+HFu~-Jk5oW^Z(Q!9GEi zdHaetS=y_3=-dBpliF{}FJrfAwxPY;j3fIFc`x3#&e_gB&2sL(cSmRLeXT#yz9>A+ r-q=Qc|5JazeK{S``>Xz{*~h%&*yqYM)9%xQJ^OO zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5=YMU?!JRdh3$<_s_avWvEJumvBmC;{Rul)p6&Z&R$A|yM-a&S3InV6YaFpyj zcKxQ^olsG`*~+Z@ef-$$9ZxUV_eXi&zNk%>_W#`V?ccOX?f2!Ev8$bJXwNj`$iAK4 zi}y7<+u2)N&fT}*=*+zr^e5W8ho{-|+oyNVx0uLRJAFul+) z2=HcO(}60IW7dT$2PI|@fYI9_T!u+tPlCM52Fe=@o=^p#Oc3DB$_5f+211Z}h*|*W CRr8ww diff --git a/test/expect/ModelTester.test_densenet201_expect.pkl b/test/expect/ModelTester.test_densenet201_expect.pkl index 92981fdecae0e99a64f1529c19a447c08129f9b7..fd85ca178c94f86b8cfe72602de4ec1e7bdfc4f6 100644 GIT binary patch delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu32g-uitxv(M}csC={6uP}LE zMXRcP!-J(ZMf%bEwd@)86>gQbcXDL4d$ek+-J1M&`>yGP+lxH=zPEE`{=USBWZM_2 z6Yaz{KerW%oN4!FNrv6IHVHew<*)6E_hgX z+HX*8vek1Aw7c|&dH=4=4%?s6S#~)ye%j8ta@Us0chbJY>3i+YH!$v-W#4IEutjuV q?}E9u^H=}a_n}IB|9%_$eMQ%EZFQ_C?2Br9z3-OHa$9f9h5G zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5=TjI{XW~-XZBfFzS;Y?AbDRqXiuuWg=&+nnscDt#y`yaw`6wMu8YpH>zeV?mg~x0 z+gCo5_RUG(YqzI?abLZCr@hw}(S4T|%(abM{bS$lD)IfhZS41@T+g)?ww|z0s_pf@ z2QtfT4J;S#1BKSB#CY`+z>orA+@ZzKU=0tgvecqtU@W*fnG+dYNFfJd8gqeczBn&E zlnH1n2nTpGf+%>JM2`f{_4yvc`Y`Q4`Qd6q3>)c!!QD*YE%;hw;{_ExaeH*ySmzbUrMqzRPv0+ff0j gL>IECEbHv3P)i30F~yPIlMn*X1Tn>t-jn14#}|)o3jhEB delta 230 zcmV`f|e52~j?ZMrEMQd6pm>)|UrgGx;eHYJ{q=cuXqFE{!OrZYU~1 zdUh!S4v?vx$-b$%-NPwxn;EDAM2;z9E#)bs^h+sajJv3lt1wZjsO#z}DPs01&^OX4 zr7}$_4t+$b78rdgev_rB7AM{)QAcK~=ujalGUtsd)zG4;F#Ad=szj2i`3Pnz&|+yS gmoh9W8Nz(3P)i30FFa$plMn*X1TQ>exs&7q$3yaDLI3~& delta 230 zcmVt3iYGx;eAYJ{q|cuXqjE{!OqZYU~Y zdUh%D4Unmh$-b$k-NPxUni;49M2;zNE#)br^h+sBjJv3lt1wZjk?ZOy9Afq<+c(lF znleo)3w=bY85n&j7?Y)_0w>-mQAcK~>`);pH0O;f+0de?LiYCy0(h$6*r6vz;BzSut8Xb0 zy6~yM-##i~WzMO}fhek@2?i=2dFZH$-A$=PgEA_El!2+#a>%JHlbNWD*!w8<3_B|R z1*)l@AqFbJ$Kxr8{3|M;Qzxe#u)C<-aKR|P()y_w=_4w50}WBC*e{5w5gv`HxUT4^ zK$X2I>V?~>R1HKZ&<6LZqKg+Q46TnS4Sx)(m&uB$0c{8>L}Z64YnA_~SBD%bJf;FF gSCxvWE$$d9P)i30a_5~!lMn*X1ajw{Mw8?M$C%M+fdBvi delta 230 zcmVYCz0(hz@*`X&o;BzVTs&6Um zx$vpQ-##j8WzMP2fhek^2?i=TdFZIB-A$=9gEA_Al!2+&a>%K8lbNV;*!w8z3_B|N z1*)l;AqFa~$Kxq}{3|LTQ75M$u)C;8aKR|T()y_)=_4v}0}WBC!Y_!aBp!{a!mjA3 zK9#*G?S(Ft1Z@Z^KxBt0NR|JnPlp^TH>LtA gTa}8aF76mAP)i30ULvlglMn*X1YRPprIX|W$J|Y5-T(jq diff --git a/test/expect/ModelTester.test_efficientnet_b3_expect.pkl b/test/expect/ModelTester.test_efficientnet_b3_expect.pkl index 0334adc46316962a0bc47fff035e5ce0cacb8050..989d6782fe799c4833239c51f08e8375a6592179 100644 GIT binary patch delta 230 zcmV{oLTLmfP%U-DVNW`fu<1;Dob)>0y1{|siXeX+G z@1Lj@43#PPN0Og9&1$S`B5_J!^#cx@V~ud~3ZswD*}(jvI1Fhjqn^CVU%;Q#X}N5j`C+k}Uyz)Y#8 ga`w0=miVKoP)i306e?e_lMn*X1QaS?v6JKi$F9_DE&u=k delta 230 zcmV{oSTLmdB%wDMMNW`fx<1;Dzb)>0#1{|sgXeX+D z@1Lj}43#PON0Oi8sjHa+{UQ@bg?CVlaVPe6dEayEFh^4=1r;e zzgnqKKn*8Mpqi;-=2a=c4-lznG)t(&ekDtr*3;GAovER z=;el~jxb}W@`dgxk!>2Ova`UcttAC1mLj;QNkhM=7$sIH;pb#N#8tBrq*<* gMiq&t=4U76XHlp*7sn`4FUu&8^aZJ;Q~;gUg=P)i30ENFdolMn*X1T1KMb(7=*$6KdpbN~PV delta 230 zcmVJTV$g4rix?1`yt;MS-D@|#g8V{?V5_laOAM>Eu@ 
zAoE2hJ$A|{jtB6lKQ5)K@ZcvnLiZ;eTCXRok>aPumq({% zyE!LKtPdx=?F%R2Qn#m|)_x@pNgb!Ww@4;v6Mv+@qxC1Wx}YaS_%)`ufq|zD6nH0q zK~E>_>Aa@&XTT=ZS%{}B65FTfx=E&P?(Zk{CfFvBE@vi7YS&SxC5i>7eyN?N-I)fb zf@w=9q{5yjAhs*0)L$D6ilZ2$lO delta 230 zcmVQ5)c@ZcvdLiZ;kTCXRgk>aPimq(|E zyE!LYtPdy7?F%RFQn#nH)qW-JNFAqmxJV{I34f%>qxC12x}Ybl_%)`gfq|zy6nH0Y zK~E>t>Aa>1Xuu}SS%{}V65FTjx=E&X?(ZkJCfFv_E@viEYS&Sx^N9tgh^d{X-a2zMh{8gv0&d{ce24W`qZI!21*b^sDAtNWm9PK5NfXJr= ze9NZ}Jb@?72%o1OPHU&EJNPDU?d>Kcu;`|i?4hQFj-{qZ2IWzwfFPWs9QsQq4XE6v zVIWT@9$&B~(_J*DmLB@1%sNLVeqy*KjM4@tUbB`a@8}XI&_KDT+E|bYrHlv4|(8A6=(5Q}L!C$p5D0*^;M8ftaT}Vy~rx z{wk)SOeZHrZ%n2j%T^_Sa2zMg{8gvE&d{b(24W@vZk4Br*b^sFAtNWK8|@{QfXJr` ze9Na2Jb@>-2%o1lPHU&JJNPCz?d>Kou;`}9?4hQgj-{qq2IWzwU?rTR9QsQq{-@lg zOdwAu^j@$gyInM=ZXWuk{yIk{$YZ!AbkYVVkF%C01nCkd(?GeW*I1k<%sUGvnUx18 gp6ddqkPxP%P)i30p8o|RlMn*X1fKr|B9r6-$6Oq0@Bjb+ diff --git a/test/expect/ModelTester.test_efficientnet_v2_l_expect.pkl b/test/expect/ModelTester.test_efficientnet_v2_l_expect.pkl index 6c17336090d764cafc4c4afbaf4787888358231a..a6709c7956d399bb573c7d996d65a50f56458b3c 100644 GIT binary patch delta 230 zcmVvXaJ+!3?*r#3UHDVsCwJOwnQ zW4kkW{wg#G6ZtbE{gyMyVLddoV>mP3HoP)DO4u_hCf2ml|EN*49u226ZJduYx>+|f zx#7yQ65De$UfQ5EgrkJCd3%jC#t2=srSes?3{iiyqsFK-+*X>jdJBlM)n|k>Zq4>H g#0dT~J`pgpP)i30htfvjlMn*X1c%Z_;gjS7$Ln@#DgXcg delta 230 zcmVnv61l(wi~5l~}W37o{_lMx`{6 ztco)l9WFE~ykN+Yz&hs5UdxDVsCFJOwmP zW4kjf{wg#|6Ztcv{FXDqVLddeVmLE6HoP)bO4u_1Cf2l;|EN*4HVvmUZk&%ZMp-vA z>EX(=9NTj=X4;@Mexro6bbF07tq5JTaPn2N5K(`$kH)Ap)mECc2n&d^zGs9qe9iVV g#t8m1P!KS)P)i30@V6W7lMn*X1n{>T?UUpJ$C!p^ng9R* diff --git a/test/expect/ModelTester.test_efficientnet_v2_m_expect.pkl b/test/expect/ModelTester.test_efficientnet_v2_m_expect.pkl index a2d307023f18dc8419d5e181417c3702c977a252..622e2458e669228e45cb6e41c915fb81b4786ff2 100644 GIT binary patch delta 230 zcmVe@}jX?^(ruP17k5@c&@L{4EC^42$Zp>sx7i^ zc6Tz4nuoCj9L+EkOZ2b+#|$zv=UcKEeQGi4X$G=5`fagrf8J3raL@WNeLyF&x)b;^ zi+7(G$3I^~ZtFc|MJT$cs1S_Gl73L2!a gu(2ny;DleXP)i30Wq&gXlMn*X1Z96S36tak$E`?cRsaA1 delta 230 zcmVIM%UhlEE?dnnE&mqld9BUzo8_ zS^6<-*|;%?q7Sf&439B}5sffNjF_>Q@}jXg^eQm$17k7pcCN304EC@|2$ZqYsx7jY zc6TyvnuoFY8_h5rOZ2eq#tbqL=UcK}eQGiOX$G?C`E9Xqf8J3reb4$aNYO_v3-$ONIWcp0Ry gx3MR(CWT+IP)i30M@NwwlMn*X1V=}a8k6J#$HHG}rT_o{ diff --git a/test/expect/ModelTester.test_efficientnet_v2_s_expect.pkl b/test/expect/ModelTester.test_efficientnet_v2_s_expect.pkl index 98f4ffb1ae1e549f6575a30dcd4c9416148bc1ef..ef798e3a0b03656dc7183ef0f9a36ef4d38bae13 100644 GIT binary patch delta 230 zcmV#+5 zoI5t!NnW*J@&>kyUcxoLKmfMFp;|S-K(w<0bGWty(tb9>b3rwb`K&bIwg|Nu>2x+2 z=PI_CcC59vPJgz$b^Wy=qd7M6cz`s59dI^L@4L0k?13~3Mpsd_^dTO$suV1>Rm^cV zoS0ZOkq<#NK@Fia!_bg6sdb<=*1|Y8)a>3hOgud{u*n)WC^98A&_sAP3XIV;8XzOJ ghxT~2SpP${P)i301FTpiP?ESS!M^`map}Mmq0=BmFWqUSoo;@|YDyuY%&<3?5A96Ou zX(_h05vsLriF>yFUir1rwl_9}`+PJm)oeC!%DJ`hqj@xXa8psWTAm!X#BM3IL-%er zHB(eI;2Jy#Bu0SF+%Hni3sF=<*FM=Vp gB13kyPK-dcP)i30eRsJ;lMn*X1buh8M3dwK$751qmjD0& diff --git a/test/expect/ModelTester.test_inception_v3_expect.pkl b/test/expect/ModelTester.test_inception_v3_expect.pkl index 313b2141d1792a7db19eba2b1462483f21814d66..51b95f6dcd5b51a038840aac32f89512b9451814 100644 GIT binary patch delta 230 zcmV}%0)#^+LG|E$pqX#CL@>*~?`-!jr{aurdYL8nnj z#(>g>t<2F}Zvj!RjQmjmH(O7z2p>-X)P_>0Wra6Qxq^ttnC@B8pN8E#Ofu zg!NGf7a>xTw1ZL*LIl!I0m{*2B7su+eKgYQ0KifwP(D&FsWeejDF^D&qH2OtxaQB$ z@)W94U|qS<8PS2#2K32MOt_`cSji4ij7PiCq^}ZEpz#{f6sOSA*&MM^c;^pM#J8-` g>;x!NZ=_h#P)i30P;QrSlMn*X1W<05aFgT$$KZi&Hvj+t delta 230 zcmV}%0}#^+LA|E$p)X#CM2>*~?`-!jrzaurdmL8nnr 
z#(>hOt<2G8Zvj!ljQmm1Hd{|X2p>-X)P_=rf6QxqxTw1ZL+LIl!K0m{*AB7sr@el*hO0Kif!P(D&EsWeejGY9I?s%nB#qUO)g z^AxI5PF=at9MOT&7xc+dSh%IoP{|HarboNcpsx~A*6|wA9H-FI)*P`>isuhe#J8-` g@dPMSbfj3)P)i30(dy%LlMn*X1kvi_bd%%)$2LoF3IG5A diff --git a/test/expect/ModelTester.test_maxvit_t_expect.pkl b/test/expect/ModelTester.test_maxvit_t_expect.pkl deleted file mode 100644 index 06224d7ad33d19b768c8b058f6887886f777a6c0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK66cPzfb9<154(Pb_V0Ocy4Yr0U9Qb13&Fii)A;tbMg6d;>`S&WH$Jl`kiWua zPut-={Ax37c50jLS@A&EhN))do;UZe?U7n0Xyb5--Nqrr)N(@RY@4^I^ldKinc5gm zW!{_feA(`fldZd@)~>W!aDTgX!tb8lTSBMXZ0tC>hih(|jmB+J+kFuRdrj(ecblu; zw`TlRYMrtvdQVBTs`U^1B-`@!9eaFUoU(q9d(C>qe@UA&Gp|@lorA+@ZzKU=0tgvecqtU@W*fnG+dYNFfJd8gqeczBn&E zlnH1n2nTpGf+%>JM2 CD)+Gf diff --git a/test/expect/ModelTester.test_mnasnet0_5_expect.pkl b/test/expect/ModelTester.test_mnasnet0_5_expect.pkl index 8c53b303c7d6086ebdc28a1116499b4482c8d81e..aaa969d60c0a9a2112e17ea1d2b52d8c0240866b 100644 GIT binary patch delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu0$(x|eDB<~=5RGIp3~-MPFa z$a=2HyMSq%AKX+odBjnvI_nIod>pT7n3z%2%Nu(_nqXHBP;z(M0NZPwo-)~_ylV>pMu(WL literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5+_sL%QSEE9+NE@J4__+T;AetJ=f$$z_iUbZmOFc=cwG0m3GeL?On&MF-{U& zlk8q^E>AjUvLbxZ<{4`lwiN8yxTQdB;--I9t4-$HX`AfnGd7;{F@Lkjkqbt1LzuT- zJfM>=H#erL;o^IQcz#Fxhp=15}UmTiSA}(z+Vi%V*F+0+^MI`(DmI8;m z%{-o`OtvZS+5!r#OI*FKCx9UZ!ni|=pTQa)T4kw4#lTo_b229~xR62)!ZhXr*?e(c zdMFdnRuB&GW&~02G>IIC0w4(#fSy9pbtC(U4@KuIAP-r$z5%*kWLNQ{=#>Dv5T+Lz z1_9n|Y&uXya?HAL<)Fk20x)_zgv&4q>`9P!*+6-N!4s+glnDa7S=m5h%s>cI4^ayM DqB{3j diff --git a/test/expect/ModelTester.test_mnasnet0_75_expect.pkl b/test/expect/ModelTester.test_mnasnet0_75_expect.pkl index af9c22131d4d2ca6ecbb0127efe27e77d7692aab..44213104b523ffe909bd1ed7f624dd72457d0c3b 100644 GIT binary patch delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu5Pdxsx|BOl96+yq4(KMMq-cNZgM*iYs3({&&X%ir%e~6*bVuTI}M}C1ULQR4>Df$@zzET z{i7R$b2*F?^1p0~%(-V25EZ_$JjTTM#oRop%~MYJZ(3(}W7D16=9}jmb{j2kGcvYb zpt^aRW7DQAA$Y`OUt+e4!bZ@w68ZRp+%05-^+FaQ7m literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK66e$0$(t@rVcuZ9V3N^_)U%rwFJar|FhobWpkcX^W-vC`Nva9$}^hy9-2-6D< zg8*+fHXW!UIc8nBa!_Ih0T{g-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu0#kJ#HHpFn>0@sWx}Rh6g7%BuReImlBIG6c!NK q_#jMvV?mm<{tO;=1C`wZ>tC0t819(gwQ)g-hoR-#Ck85WOpE}s1C zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5+|ooXoKebWJ9@Ex<(E6ST{CJ6f!6~6}{2z(CrO3ZfoiP2}$1Y<=~EuHJPa! z63h9FEXuMs?3=)`X_MZ3!x_&CjHa1<-`H68X@kEin~{&>p^d-P)^E7*^OZr}2ZK$m zg3GKtMJRDF2VF!=)eMuoN|hNgSmHcn;! 
zZ1_fP?uIoFPHs?;{GiV!7GWqPAhPjFnEb{EDbo7ac-Rdhb_=ZkQKn+JZ+_RtnI#^E zHfx_4D9kZ20)>|JDeGA$fFT9KxI>Gd!5SV~WvNBQz*ul|GAA;)kU|c^H0A=?d~sfS zC=<|D5DxHW1X1ubi5!OlAPE$JooXBrWYCp z0p4tEI#5M&%(`&ppu`LUFnT+L%P-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu0$w;JkK_K zB0qoq`}zfj=k`uBJd!1`an|RP8_FJq8FmF7HeBh-yD>=ol|j4La)bOcrW?cmo;J{b zuetGG=7o(@92yLNJ-0VFGl$u*>~!zO3$A(W>vNy-Z}=K^!0<}_gpF~cOEyNdl^b|D zIBwj%h||#BlVQWoquULIW9M)98a>I7rS8UtqRg2a8zh_7mYx1!uzuYugDdKDH&`qQ sF}(Em&_)K!0>h}sPYsq8*c;9@7BPIzb#6m}ZLi*)bD0}Xy!>VW08FQzu>b%7 literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5~oRBW<#zk(?%bSBO865XB&QypTGV?{Q|=id#4#5%aYhQ?eobEMUTP^JA)1z zu5jhum?Qqmpu=mqLCP7^jh25;8~D7}-1sB&!p1&_2E!N6?F~-NVK%Hd-Mi7ijeUK^ zQ~nL#!VVZ-ub;3nNp#7^n6`3*Xa~oQn-_5!ntC#9*mZQf;m?@)8$L!)GJIQmV?$Bq z%#93UO=}ZQe=yjz?v=p@^|>3=mV_9d`g>^O4~qiBh{sP2mK4|<_8E&9zUDf&;e~Cl z-rjSW8y>yIIC0w4(#fSy9pbtC(U4@KuIAP-r$J}?YWZANw#KZ;%npbKGo zp-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu5Q8zJ9$KA1rm}MtiJd2#MA2 zNnWz9;qBfvFRV25e5${#UBIz^P1>Ipy^LLf`obj>)~P({)f3xyU9ZO@Vci$u;&p0v zUhAjF&04p>)oT5>AKmK(nh)t(-d?8X!=A^jSF}@FPb^DO-{ixcweh1qfxJkfmN4@M_(^5T5nz1o^-wN pupqt3`+D>^4%}at?&7vKr}y((;f0xc(hSGf6|62>+aIpA9su$|l5hY3 literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK66bVZzaGa2OWl@ek98k|WA&#cFIl(Y-QG1ftTgl-s=uvm;9S4P>rabbx2TN+KA=*x3)(KMSCKq(NOAXP8Jz{ikSl$?=S zma3nUSdyrZrp3*Piy;aQF7Ror=}k}3vnsoy$Nf)!U9o@0I-BSP>$3lQ>CN!7(6c&v zV_nJe?Q1W8S6T0K#B^QQ+?nh8xPPy0NYU3aXvkcrdKV)}3^F>AJaU;rfo67uIEPHS5JcU9>Ku<%5pFGud@E`h0qa zG*+yeviXkgC(FupAwsOWYQLB11$eV_+`0DP*<=BxNu2N?nY^0G)Cweg0vI?TECa*{ v@B_%i7gm#bm{YBeLe)=Vf-nNS+1Np>aApXf5l97iv$BDB%s>cI4^ayMBioj; delta 375 zcmdnVzM6eP4=W=B1B1cD`Mx|%76)o(0XZN%K}w8|L5m@=C^;juELA@xu_RI7z|BdG zVR9goT73tv#+oVVDSBFESM=`xm0wropRvv)dcnGa|6Y0xeinL$M{lfKvV8m6bKh0g z=N~a$7c_U~x)AQ)YZ(glwd5Nz*EVIgu8r1wptt$0*}8XMB-U?eJGHLF@b9{~t*&}U z9}DZ5HJn)|QdOxpDc4DF%k&Ce35D}|?;iTBJ;1qGCt(gxpk9Mf)4Cq71A1w4)RcS>W$x?P*^=zg)R zT&E?(s;l^WnO=Z5J4b5H%8tnbOp`d_{-3;>$y5gwy$k~%Lt;^K zMrK*6eoA6VqCT1yHzzKJC^*R97QIgS?yvP>J6ElbJHnuUW%r}?yH>NVFWdBF-P>;s z`iE*Ju4i7YZ!~*>{<2L`>z8|9)|ZFsE7_g{qYrbtv$2C%;mi;|BajO4W@Q8Mn1K+a9-Q>*p=Gz3$VARr)ol8J|ISbS3o9ez)Y56 iPG#Iaxq(?+$&(qP5EQBb-mGjO9y1Vb2hx+6Ks*3fNsW>K diff --git a/test/expect/ModelTester.test_regnet_x_16gf_expect.pkl b/test/expect/ModelTester.test_regnet_x_16gf_expect.pkl index 03135b65539c0ddc5a3cb0a98e53dc5af5024b64..e4c3f220ece614658622f742d4f265c94cc0d7e4 100644 GIT binary patch delta 230 zcmVQ?^AtV;pbAkw6TG{<-WvWqq*o`s z^DzLuF4XkBH}|ByK;7a!sJU%E*hbPk)Gow5KUal41!8_Z7RuQ@d1LZDd}#DN4wn$U g(L8v*k1XsxP)i30$tER7lMn*X1j!~PN0a0N$B-9llK=n! 
delta 230 zcmVh`3)_}t<>MY(N02uIR9lP<(P9#@4uzF~en%gNb29Aol5DQNUP1D6oJ gfIN7v_Dq%UC>HBb_{My{S8z#umNF zekHxZm_0lo4Rkzmy;(ixhnl?m4kNt)k7c_OcjLRyKB2r{V8u~9v}&O0VL)|Ls)0Denusi13QL g!c;)K#h?5;P)i30XrtjOlMn*X1Zbn-DwE^_#{-3L0RR91 delta 230 zcmVZZ5s- ze=v_DW%UC=~Bb_`ny{S8l#umN0 zekHw|m_0lW4RkzSy;(ihhnl?X4kNwzj%B+*cjLRjKB2rkV8u~9m};>+Fj6JD^mE%g zNe~~s*#?k2QG&F*TmqE4bQ>Z)aDlG83b#}|u$h-UlZh}rHm~Bm=RykGG0AWQ02WgE3G}+TEaX)T3Nq_( zUAnu#>N-8pZ+AR_ma4o`hax_cc9pzKx!pb9!bLs1PlUZRx)@PC$fTpZRugNzs^+Ub z2L}+n#HrIgw%oEjD%2A`X}o~F=`FepP)i30RnH$jlMn*X1Xa%;K9l4E#|tQMO8@`> delta 230 zcmVRxfGG09qQ02YkDy=wCS)={P-L zTe`bi>N-8lZ+AR(ma4oHhax_Vc9p#7xZOQ-!bLroPlUbExfoGBUZkVE6ccN`Smvue z_688WYN^vbM%=PI>eCZGH@twoPVm({0HQ-XW(DCrN^0l4`iKy|^QI!cn6t^erDs*W gm`BAtzDhp5P)i30VFkC7lMn*X1YrfYlau5E$BsF1DF6Tf diff --git a/test/expect/ModelTester.test_regnet_x_3_2gf_expect.pkl b/test/expect/ModelTester.test_regnet_x_3_2gf_expect.pkl index 894ec28c17bf3915ffb1242567a249492cd77b0e..bd745ed1f0558bee8b4293ab34291e2dcbbec917 100644 GIT binary patch delta 230 zcmV#I}3>7@;Z$7=TKZ(6UY)-w+UH!fK5Dh+O+UUKMo7BB;7!$s#J&3>7@+Z$7=7KZ(6AY)-wJUH!e%5Dh*s+UUJ-o7BBh7!$sQK|Z}C zQ;Iwp0kb`mm`=TDg_Avs!?eAudLz7P9hx(O0@C#AB gdWFcmcB^VVP)i30Y+*c{lMn*X1Z-hEoRj1N$20tFu>b%7 diff --git a/test/expect/ModelTester.test_regnet_x_400mf_expect.pkl b/test/expect/ModelTester.test_regnet_x_400mf_expect.pkl index b53d27243c769b4b15bf4c21f19af2684c0a4a3e..f3488deb2876e7ec867a94b833abb472eef9185e 100644 GIT binary patch delta 230 zcmVDFPy!<%%3}$@rJ#moLIcY`6@id zj&MBj1O>fu-IO~aR1rN;g9W`DXTvL)ukAdL$uB(G4q?2kcw)U?l+?R1 zpJqKt`V+n7pCmmPplH0@3uHVn!;ZUC(wDu)t0cW0$QC>hy%SNrKg_bcO`oJZnt2Aj zuAxdjTV75)DG&}l3jYAT!NqMnq?bIrvfd-IO~FR1rNvg9W_;XTv-|2SmF0r+mCNukAcy$uB(O4q?2Gcw)U6l+?Q# zpJqKT`V+m_pCmm%plH0x3uHVz!;ZU2(wDuqt0cV+$QC^Qyc1Eq8_cr2DW9Y~l6eNb zlA%gHNM24n9uN*a{{8^Hmc?y6!k0X}rw8LaOgsiX=9O=}MCEq8>;scLLTKx}FGUYL gc=RE>mf7CCP)i309;MhclMn*X1RkZ>G?U~4$GA~!MF0Q* diff --git a/test/expect/ModelTester.test_regnet_x_800mf_expect.pkl b/test/expect/ModelTester.test_regnet_x_800mf_expect.pkl index 153c8278c5c379c1baa30aee802f28bacb2cf63c..efb71925b286e5f2016eeb57678bf4ca4ba32d5a 100644 GIT binary patch delta 230 zcmV0_6R$YB?rAmirYF`ln1-iqJ_K&c?~_vDj~c{u4lYM z*|ogy-he!h-jKYz7#}>tc%i()5xYDz=s-Mj=fHjcbr&sn_wVNX0GiVi%oKG-~X zl?1zNf`mN$B!)Ub)3dvDwbZ*d2#UIs_p!X+sSCXdN7K9$)qznwXrKJN?rO_C|6s1X za7m0jMdA57R_CHTr3BWybnKoyy#|RqTD>AY`u2)Ei$goRfjQK>SLv3#Q1SA-yKp4E gwLT2IV4ATyP)i30D3#t|lMn*X1Spl>V3Xtm$B{X3c>n+a delta 230 zcmV2_6R$MB?rBhiQ77Gln1-WqJ_Nrcnv+mDj~c!u4lY7 z*|ogm-he!W-jKYW7#}>cc%i(i5xYDb=s-MlK>j@OHIBSg&sn_RVNX2%i4HuLKG-~X zl?1z3f`mNaB!)Uj)3dugwbZ*<2#UIa_p!XxsSCaRM$^0n)qznwQJ?(0(Q3;)?_jRH zXi1DbGU53rg+@dcM9uY4AOafto&;4R5`;W6HhZKPo;5cNo3|#}Yn3r8B&bvz5NhLfk%TdJ(^+ zD}z4lXSKd*cxgX2DLKCUEE2!j;c~vf&#J!ET5Y}$^GiN+IzCZ82hRJxL9Hmi{X026 zr0=-AMBX4jpsu$(QqBE6*pMVYBV80ez*bJaNWa6rFy|0I3N&Lrc8*`aJUs`#7Lp0R gI}_f%^tjnQP)i30-yL+`lMn*X1m7KW-jn14#~4^{%>V!Z delta 230 zcmVrg+SdcMB4Xz)E$fto(B4R5`uW6HgmKPo=qb{M|E#u7eIr8B%>vz5MVLfk$&dJ(^H zD}z3!XSKc-cxgWtDLKB-EE2zt;c~u+&#JzVT5Z0?@=HEXIzCZ8w9Wgz5UnV`!aF%W z0PwiH3f>?;Fs`>e!OZyzXH$KyL|zW@LL diff --git a/test/expect/ModelTester.test_regnet_y_16gf_expect.pkl b/test/expect/ModelTester.test_regnet_y_16gf_expect.pkl index d92adad9d05d5aef4478a73dac04d4ba6e8859bb..520452985badad04866e7e062228bf00336ebe34 100644 GIT binary patch delta 230 zcmVbPO7@Imp8o1T@kz# zYf(I)ZLB+B)%LqUyQjGTgowAO%t*P0Y)U+&XQ({&y2QIUP;^l{eolisX44zIbG{wC zss^Mx+OIpjLX2uWMuY{s!fb6k5ZHve08YL)1|xO6UOSk(Fpdg5Do}|#Rx$KD?P0IG gFheQ4CDE`uP)i30MK?mU~jwfed#(hw0FDJmXww_KalReA 
zj|QYWystaFC5&o3nuG0MW2JP)i30G7XutlMn*X1Tqbovy7mOXWLUGy*&tIEK8ZRkk`!L;E{OUZp&7jxapaT#>vuh?2XV z4T!u$y`8&&%;Y$UCObU#M(4XQ&#AkbD)hTmfKNOB)}T9|@)=P)lV2yiDSCK3Dby^x zcYfVF!BreOG4ZK8Wa9+9&M&aL7_(PAy;^@fQq23iR`QCv6Ux84O|on}|Be(q{=mXK gq}p3MA65puP)i30V~Q_8lMn*X1Y?RXK$GMG$GC-Sp8x;= delta 230 zcmV7HOXWLFGy*&lIEK7nRkk`pL;E{8UZp%+jxapQT#>v%h?2XM z4T!u!y`8&2(d0OICObU$M(4Xa&#AkYD)hSufKNN?)}T9w@)=P)hF>SV6?%9)D%32y zTYlX;Kvf(%BJrs^Y~uvH$S<(F{IXX(qgsDFMa=uVAo7a30?NO;EwXGp<&G3Q>cGN0 gjM`f}Ayx*wP)i30(!Bn(lMn*X1k$|zw3Fll$97;N7KC4h#9?FE`q#-4k^2)LXEoSO6fcEQ4mo)aGRMs!}Ah526~OW zk(!pg@!V~^X`Uavv0cABfHxF6TuvN4WlpZU@YB#b64wnpmSJ$bK+D28j;IN|$Zy;{ gTlS_q(5EZAP)i30)TE0ZlMn*X1k|L99+Tt($5SqH4*&oF delta 230 zcmV8t5>_aZBxAOXTG~xx}iIYG%`H? zpRPQP+sZq)h>pAjetkSBU0J-EOu)JhdAd9!V>P^c_>jDM9UHuoCIURfm({z{$um3E zXp*~OEN{D_V#+-BT7*1OEg?J99K^eoeyX|vM58uDkR3WW(all3C1VP_YTE!k%+UZl z;HmDrcTg5QBa;|Bpr_kiSo4_b&i&MMqH gZYlu02-@DeP)i30A`p$AlMn*X1R@ZPpOfSQ$D@pDq5uE@ delta 230 zcmV_IZBxA1XTG~cx}iIAG%`H# zpRPP|+sZq#h>pDQeSJI}U0J+ROu)MRc)C0uV>P^X_>jD89UHugCIURXm({z#$um2> zXp*~TEN{DxV#+-2T7*0xEg?I|9K^eLeyX|v{Gv95j~zND(all3Bx4G^WZM8ew9x=N z(5ddbXHXVA36mH+pr_t9g4i?YpS~28GXBB4yil@Lo__u9g4i_YpS}`8GXB44yio+LNq+yb^tqaYiql_ zT}Hgp!p}O+6OlU~NUFQ=6Cyh%I2gRGn5a9MI6=E;0xi5u(-69r3fH`tX+yl-y$?Ii z3M{uDdnK$x>SF8~9)x)_Q)Fn^u9+<}ifPBGg%LK%`ga3wyxXxi*LCJ|w~{KN>n gkR?hwxIP)YP)i30vbj1UlMn*X1hTn0B9r6-#~hMmU;qFB diff --git a/test/expect/ModelTester.test_regnet_y_800mf_expect.pkl b/test/expect/ModelTester.test_regnet_y_800mf_expect.pkl index 419e97551631fdecec45822dc0178c7ffde47cae..a27eefdfd299fc38dad0b600cdb2d35effa3d9e8 100644 GIT binary patch delta 230 zcmVb3hi(?5*6M|oDlTa} g$kVYqM?^imP)i30C^eu)lMn*X1SmD2Mw8?M$HUWRW&i*H delta 230 zcmV6<^TWy diff --git a/test/expect/ModelTester.test_regnet_y_8gf_expect.pkl b/test/expect/ModelTester.test_regnet_y_8gf_expect.pkl index 39817c8740d47792a631244f65b1fc275f428a86..aa12345882f7c6eb2a489240d108af1717fef416 100644 GIT binary patch delta 230 zcmVZkHsy;l!wGg{gOGZ3de9}9ZX!1N>sDQj) zUdlUP=$t&(!#q4+qS?F``lGxJbu&B~VO>1z;|@F5C>p$Io(H^v5_Y`X!i_tE$ymHE z3Ew<5{fIm)tbsfU>505Zmw`L8f{8rK=?gqy&2l)(VicS_rC1NV gqojkpcV!klP)i30nZjwsy;l}wGg`?OGZ39e9}95X!1N-sDQjb zUdlU5=$t&r!#q4nqS?Fv`lGz{bTd5nU|l@);|@E^C>p$5o(H^h5_Y`4!i_t5$ymG? 
z3Ew;x{fImwtbsiL=!v{Kmw`K;f{8r3=?gqS)^a$`jwieqyBSfv9wrbx7CBM8y0`E< z#xxkatn`~Z3R^@w35^3h1Qa|wnvaaT$h?<4nD@;)NbyfQkh4oVUvT%lO%$9wpI8sP goTP)iU}Y9OP)i30>u7jQlMn*X1nX#cO_SsT$BHj$O#lD@ diff --git a/test/expect/ModelTester.test_resnet101_expect.pkl b/test/expect/ModelTester.test_resnet101_expect.pkl index 171e47a488857d3eb3f268fa5f40460ac93cd83e..284aa92b2df6662c384cb6849a92d042254deacc 100644 GIT binary patch delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu5QazyDmXT3k8S7iaCtXH)Ig zaAN<_B@wHRzF}l{D~(cdt2%hoZNoiV*JWlp$Be$taBK3s=gP4#=4i^fTdtuM$B)(p z_a5t;-Q*T`S;19nxzbUYcK>5~*M1&t*_$`rF>Yy;%aY7PN3X>39NYY% zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK66eaFf39p6SB|ZTwRV-Zu6A2@eE-om;j508F|xZoid1o%ci^U*(mh+(Kc+gz z7JQlEHs9l(tHy$uqYKX7axE=Ce)L07?=gYdO>Xs<6kN-eDIN81^FJ1P_26TpxHVcem`&tMG?t+LdjVqh$|Ihhj~Tu31YVH$IRY`!=z zJ(LM(D+mX8GlD31nnaF60gwa=Ku@9Qx{>|FhobWpkcX^W-vC`Nva9$}^hy9-2-6D< zg8*+fHXW!UIc8nBa!_Ih0T{g-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu33Ik5(^%R4&hgpR2vE*V%b( zGk)Zy@v7hJ()?E^GoJB#b$qQlZFSq=l+(&Dr_}x}^0KJ^ed?K#z4!Fx8&3s>czYyG znCV%!D(dvD9ji}Iaol%$kL~+T?R54>_R}mJ%TJe_TzKk? zt%*#A)y>I7Fl{Ye-mQay6vv1p!Q@^gV sduK9myOW^aM1pHI%X&FQW5gzvPOWsUdg`7Wn(S`T{x0G27Ay8r+H literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5~t&PtJl6{F3++btGy4@+IiI*J@QI>+3!^}@72lePx-x!zf_%$yJc|d*77f> z4E`?iT3h@3)FnlG@1kWJPb~=W_OR)n>FK&M>hz=St54TD>^uEM=HAIeYo>UedBE-c ziIK%y>9pGE86VhBi?c63-F;%=sYfa7JAKi- z#(UK~m(y-7hrK|d)h->Cd;%C!AdEY-_!+F>p;eYzR1Ay-Hz#ul>i!MRpZGie3qz3t@Vp zVG!WW#-;;RB*&}^R}M-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu32?6@$IQZdUtK-K};<&!yY- z8Ts$eG)Uc7?w_)M!W>6COUD)VI&)^*?M~*l7iR9V%k5igf7mj>e${+&d-3Guw$pZ` z*nOPy(5|&N-*#8R&%GO$E!}_9Qox=qGSAN5%-qJ#Z-LT&Z|yR>#x0NS_QkT<<^5>4 zKU>yme`)8seIByQ_Wv|zv~S>CYgg8}ecuE=9ox0;v+Z6USJ}Vv^(Fgb*QVLEXh`p$ pD7tl@`Lh=L zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5=X;|!CrnhtNpU>R=d7)>2`5O{`-{;Quo>Ur|d7A<7g+~xWb-k&TPAR$=vn~ z%w2X~eM{{-ECcN8=8M~ZNm_23wj;%E*PMrT3wrWxzr_FC+qP`!{$-W|_D>@0>|dJM z*cmQR+Rvk1W@oeIvE8azHap)R?e-hWI_=l&T(?hAcG>DF+eOLn?Dwcyy)QOf+TJHW zeea6IJ$9hbikNJ%@&quXKp1yu@iSP%L#r&cs2CUvZcgSz1{YGuL72u|Ae%4FOAloN z+6uw}-i#m$o+gpwPyi%>0?<<^x^84Y@uBE^1>_;?);B=ci|i_X6ulBa7sB*H!yv$$ zjZFutNRC+-t{jw@K>$W?hj1AtfjtTGE*mIsFnB^0fHFaVH!B-Rj2Q?)>LF?Y9?k5p diff --git a/test/expect/ModelTester.test_resnet34_expect.pkl b/test/expect/ModelTester.test_resnet34_expect.pkl index e6812c0c2e9b4c49c9405a891b6a7b130c3e432c..536075bd40e1b71325b1f904dc171de5aa3c5495 100644 GIT binary patch delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu33y<0bZW?);9DHsJ>kxdl3C zPTaQNa-qh7a+%$Z8ikt<*oWvJjA)+dkgb>Mpts}yfhQ~K4tzaT>iBAAy+fS-EQj-p zTO3#&W;^b@a_?aAn=c1{elKx+`?%|%Sasf3$M}ZZ_Bl0*2P>B!IPicw?Vxc5`@z7u z+y`gxl6IVz$$jYD+m?eb7x5qbdNBWhm}S;MUk7zZn=4ZtWnASB3S>Gty!a;NcvgIe 
ry-h)aqf$q^Bl~$S$EiDc4*i>=dEnt^g##AbjyZHi@E`OFx84r`lIfep literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5=Zax5_@ZRe#bf1;RoZ~0v!%a*tXwdp~eAKnca>@3pO426s&*nThl}bL%mdo zb=&_RSh1q+K*_06N1K`T4i5UW9EumWIBd3`?dWvn-a)oEUk)~WFLB)Txa;7)s;!O_ z8gAS3)F>V_SbpF@3wPSVN9F7X?dEbHblD~Cn2^bRsO4?T!PSfS53W6!e_)P9)Vkvn)Q!^vUBHz~(F@g4T(^Aj9jwYNKNKF8%4yp!ip#1zc~U7r;WyxMxq z!9RlkU{!?meo$z=>0hLH0vJ*tj61aW8LZ)pd1Mi2#0lgM!>0Fpof=qVIkH?p7jP;|Zm@{o1w8=&h&b`?L0UJ0NJVS1rq z5a7+mrUO+Z$E*ui4ob`*0He1-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu5PrZdb>WmopD8c^K%B+>vtd zWw^jWxd+7#zg*KCxDwy)Pvl>FP)Y2ZJ>vmo$FQn%jvFJcI|?}E9Z1x+cF=jc#UZAf z<>0g4jDzdWDLVeV(d4MYWx0Rj1WCvGRe3Uw>$=Z4m^d7D{8sAau;<#*1BUCl92Z5V z@86oC=-5=udhl zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5+|?Q)lv85%!8H>10BA!ryQISE^x5+ezC&?*E9#2gtz+z_}3o1A$rdKzgAAj?dX9w>$n_OM5gc0$WU|)DrP;nBIA^U+Uk`D&rMwFc>L+B1K-?F z9`M&&a?ma>;^2*~^$sSyx(EFh-*T9_be)65_TBpqI83zLxi9MA_PvuF1zy=4WZt;= z;1t8oga6-FI)FmU=3ME;6TpxHVcem`&tMG?t+LdjVqh$|Ihhj~Tu31YVH$IRY`!=z zJ(LM(D+mX8GlD31nnaF60gwa=Ku@9Qx{>|FhobWpkcX^W-vC`Nva9$}^hy9-2-6D< zg8*+fHXW!UIc8nBa!_Ih0T{ga diff --git a/test/expect/ModelTester.test_resnext101_32x8d_expect.pkl b/test/expect/ModelTester.test_resnext101_32x8d_expect.pkl index 4849c8ff061f1a19db0940189f9e5f3648e7e0cc..f8ea2c0e67c4ddefb2ff737d34fbb8f52c7e0422 100644 GIT binary patch delta 230 zcmV*H}H}X+D$UnKcG(ru$zQ+JP z`~yZk@EKS=SYRH#E{;9D1*fb$0+>?0+K!YwviP7pgL^}{yM$D_9eY;sL?z%_$s}{;IKS`G>*HrH}X+DKsvd(96}AeqQ?L| z)B{F6)frelO<*3q5{^B+?xw6f)|XPf*N&7ti1?s9U|hL8e(h$w2NFHJYD#)L8NT&A gz_F)0C=~HKP)i30d*N2(lMn*X1bg9D6tHC@}5fM9uTy#7H2{XO1L%+OQp!vL9 zF%&)Vp`N^6u(rJS8Rk51cJI6h*q^%e>Fv6&-2^?*s0%z+4#qq<*|I#h?SQ;KH?6y3 zb1OYqTC+T<_DVg_hnqcunDsk4l2yF=*Ezf^^7*`ui>SOnf!9&G$OfD`A%a&t1o51_ zSJzcMy_2xKAXGRy@EM4^=V&rKI*g6H#OK$#Y~=$y2;pbD$}md2f#CN%d`ek8?C>Ew g5Q6}_ZFjl6P)i30q(InalMn*X1f)RNW|QOs$Kefgm;e9( delta 230 zcmVFv6x-2^?qs0%zu4#qqs*|I#q?SQ;QH?6x) zb1OYgTC+Tn_DVgyhnqbZnDsj+l2yF8*Ezfd^7*`Wi>SOvf!9&Ghz6WG5`tGe^6;Fz z6xUTev6Ha8a8Ecoz!`|Uu4pnmDU6N0z~|SxGUWq2fZ%7lZZJx`XyEreVoF&(Jn$hq g*nP)i30BgMG?lMn*X1S7?`|C8hb$2ib*CjbBd diff --git a/test/expect/ModelTester.test_resnext50_32x4d_expect.pkl b/test/expect/ModelTester.test_resnext50_32x4d_expect.pkl index 3f2b0911c22afb2020a7ec2ac59cc4a7a6468c04..4e5dc89c496e36ed21836dad9cad31e772ca13d4 100644 GIT binary patch delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu5P@rA@YJPjhXqcvSB>!{}vw zF1OQW+dd83$@A9iIltC?uZX_Fo`k}$)(m>lduCl%uvtFg|DFehVYcfoPqV4J9%t)R z`)Kv&ZzXg3Yg`4c4VD$M#IS zc;Ck4%B?+@%!T&wcR$~AtSihW_}N99d%q@G%f)E#VF+v8G0W(rwbjmF)`zCZ+ZKEi s*yZLIW@EBpp7rn79X4lXsPFBlFy4LYUGyHVl9e`hmULN9IJ;{P0E$MU3IG5A literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK66fgBCfkIkxi%|2s`r>MdRbq}?X;P*Ps6rg-kLo-)|&4X(O1|LQux*Sk#6*! 
z`PUU}7EJiR=Uic!?T*XSYznT&+3ME5+w(!PWlx4fp0!Li``+5qhI{9vPuLsn(q*$` z$(LPq_jC4`A6BqAu(ZKC$K}|b){FOTB(B`rGsRqJ&#$iMd-isP*|{K9M=teIIC0w4(#fSy9pbtC(U4@KuIAP-r$z5%*kWLNQ{=#>Dv5T+Lz z1_9n|Y&uXya?HAL<)Fk20x)_zgv&4q>`9P!*+6-N!4s+glnDa7S=m5h%s>cI4^ayM DWq|$t diff --git a/test/expect/ModelTester.test_shufflenet_v2_x0_5_expect.pkl b/test/expect/ModelTester.test_shufflenet_v2_x0_5_expect.pkl index a220e2fe1ea36ffa9e084083477c61f6b23cdcf6..72af3d712ededeb26ac1feef865fecb6b8dde7e8 100644 GIT binary patch delta 378 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CIN#c>fho3-Ap)CP#zZ|Mpb;Rj{p#*L*+xtD96Qy#XW8Y6 zHfGK1-K~}TcSYH( zvXR+)#5&;k2b&f%X{*{IUB5o5`Ng-+T7#?qu83@`z#2w6pv+Ci7PAIehW$ yo}hJq_S{~?X=7Q~wde5cW}CTQQr0&o-({2*SbxQ9w;|JO8=+G(_C)R3Y7GFEPKt*B literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5@-9>-Fvc)mh3rps(H_{%M)$Pzkjp&DAQ~8+QW0V=!ehNTNi(}k!?R`-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_hu zH#ajVWCrU>E+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu31*N})}1>rI>-uiQ!gyqqp6i^Em85##?(#EW`kJo?#T?3 s_H6!SVpBKo@b0ui2AeN4D{Owu*tEw*^xB^MXB%x;+spRwty{eZ0Q{w(B>(^b literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK62~E>&?dR{rp;WJ@7A*__w5c3*0WIzi`=a!Vrp&Y-?Dr8_Ukq+64&-zIB&AY z;q3Z7DRR5_eAb(>XV;Ba*0WPi+B9G8**(*~+GfVnP1YK_`)o=k-M88H|DVl5wq1K1 zRA1YiUf*J~*87MJ3s?D`&v6I#yf}H>ra&)u_q<)_tO~9w@A3ZAW8m^u%I>Kb z7VXj0%C#1+cxPkgrf0K3tatZhhDm!ie=@PDpLck7dLe_&mzfndKW1#&<1Bh@Pu{bQ zHmvPsd-&F^-UAA)Yp#9sP5?s+gmH%!KZ7+qw8~P8ih;4<=44J}a3O^pglWtLviah? 
z^iU?CtsorW%?P64X%aaO1waxg06m4G>qhnyABxUbKpwJgeFJp8$gbi?(JKLTAxtkc z3-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu0%|I{!Ud^)-7$IcC`~wY%(@ zdHu;Aqa5`;$Ffb6_@1IQi)?=6?XtF+FvTV; zWal3BmY;h9nI`X1m5<$1JNY1^G-KZ6yNu!@{Cz)cjNVq*_(fjWE%tSib%WOeYXIZ1 BhrIv* literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5~q5d{~pcynmuA1vuqgKUG~hp{$!6)j{2TsTIXzJHgZ{gwft{YdFaHRYrzMs z4Oks)%yn1pKIpP#&(<%6RwZXH>{)tp-ku+8FWcxa@a?KkerByxd~Z*RsFRKVi9@@s zr`g*axO{w1$(ltrKl653+f0~Z6CSd2k9y0`J%LP<_o&Fn?y37YbC3D{Pd4_i=I=Q* zy>-u}ZToCeljiJMlzn(lZs~$u6;tc?q!^yE(tFxwHLYsl9&g)kd-(c(*ciR7u0?<<^x^84Y@uBE^1>_;?);B=ci|i_X6ulBa7sB*H z!yv$$jZFutNRC+-t{jw@K>$W?hj1AtfjtTGE*mIsFnB^0fHFaVH!B-Rj2Q?)>LF?Y DJ!=0l diff --git a/test/expect/ModelTester.test_shufflenet_v2_x2_0_expect.pkl b/test/expect/ModelTester.test_shufflenet_v2_x2_0_expect.pkl index d2ef01fc6a5ef664aba43a8cbb26b6bb216ffe3e..1af1049efbce650a979a3776d0e2e007fe0eb903 100644 GIT binary patch delta 390 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu5O74R;$`jmkai)+y|kjE}JS z;k|Ot+Kp^`7O5<<+b`rX~VEZ%s}^y-j3ERo?h5r0)| zICLBKbe?G2vp|(^k9N|`J?HvZt>2wpVe{?vl-=6JGpz-TK3d;sIk3C)3dbHF2kAW% JC% zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5@(f$yN#_z<(~EH6n0C+N7(%EUb$!OMz%eRR2JFH-5|GHLnm<0`-Y3Ve5|JJ zaVTiCNr~TM{d3~R-Rkq0Y?SVtv^FzsxAD8hV-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_hu~PE%up=MPU7r zY@T)<*$Zrwvhr%divP_Jw-56)2eZ$(@!CIH`wwjBHto08=QJ7ZPOO~-mQ%A2wre$V z1MTULt-yWW)yV71KWm258>^V zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK66ZYAMEk8vb?qx+`>lbNFswTC+73i7bkwpx@#7_!9jg7q?xx{#+f)78_8>V$ zJ3;$Ti;8SNGHKgR+R|xv-J`|6gRuy#-(H(vV( zYyW}GtkZsLeL$1Z?%3K%U^yxKV7qQ3SFqeW4k54@qT}I zKZ;%npbKGop-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu0$caiiUdm^^E1?h97SMJ`!_ z#PUDN*c`jK*+!Rdxz*XXJ=Q!SZ>&=`EwoMgy4nsTe@kzp6^QQrvfnP`g@Rq8`fXe5 zz1p^Ex5dEXUfqvu7#Q*x82&mRwQbt&VQXGnV!QOyB^$2kvbM^_3AU{koK}3cJ8auk zcUiStmDucGC2xLs?_1laOU-Pu+^uXJN_A}W+EnbUk7`=EEiko-nd@saZ}JA)({{&f ak42i zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5+|~_(e7kSp0yPB1*=0Mmn?x6F=T#}vDtocv&~E1KTeq$ZKX`yY> z*VT3)`KNjttw415m;H7@FBI&8)oO zy&#M`wD=jU;h|NQT2u^-1ve*iB7+MlqT}IKZ;%npbKGop(^b diff --git a/test/expect/ModelTester.test_swin_b_expect.pkl b/test/expect/ModelTester.test_swin_b_expect.pkl index a0851737cab14b767d92abd0a1bc6011e262cd0f..2ae40af400fa9a201530fbb9760d67397be0812a 100644 GIT binary patch delta 230 zcmV1WUID$L^^!ht@y@>d`Jg}e zy8FIW)jhwkfxx_M>KeX$sfoVQ0TI91ZM{C$DJi~wC9FUCyiz{wmr%czNV&agi2}ZQ zD$Tz3S{uLjUN=7g77sp|YqLHGn%BLlmRi1+&<#J*{U<**mts-AA`LCSIgZi33wcdH z6JQ*_{H}>VYY^!^bgnkO?gatA9vK9`d>co<&oocJfTvPAXo9gn#LAIB9#l)dtB}(^ gn#Nf_k|dD6P)i30v5x|SlMn*X1hJ0-gOlU}#~y`h-2eap delta 230 zcmV3EUID$n^^!h$@y@>g`Jg`p zyZgRn)jhvxfxx_i>KeX-sfoVR0TI99ZM{CgDJi~yC9FT5yiz{%mr%ccNV&aai2}ZG zD$TwGTN}UhUN=7!77sqAYqLITnb*CLmRi1*&<#Jo{U<*#mts-A4-GB9IF8Z26nRZQ 
zC14!C?yiYHR}kqwTCO&}`vn2NEExp9d>com5k(D{Hr z>k%Zs9oDVB`w@@6fLhPJ+p&DVu&5kApth~P_I7ALWJcQrA=a(F3KEaLURuw++_8MWw5S|EskW`Y@OEfFbUuZ?1q7x(ZKRDq%w5#I6v^Jb glwR|`hp(GHP)i30G0b`*lMn*X1ToBdB9r6-$NhM8K>z>% diff --git a/test/expect/ModelTester.test_swin_t_expect.pkl b/test/expect/ModelTester.test_swin_t_expect.pkl index b7cb40c10b561b10c7e516e84d3b43605b80df83..8fb20630cc363e5b5297d35d29eeeb15756c10c7 100644 GIT binary patch delta 230 zcmV${#<4f``3eqei}hI~70Pb(B8A$OOOlYo5NM$CW~m#sh6a;m;)nF~Lt4GBL!RqnpV9&J&+#*ojxgwf(YL&Wc$y)*g5yT@ErNQwOFIR6kVFWaLaSPd^HL`Jc7f%t??tjfbgI_MqVzz g5Ciu-;7n`2P)i30usa|4lMn*X1h6|F_><%U$F)gsX8-^I delta 230 zcmV{m#sh9a;m;=nF~Lp4GBL;RqnpL9&J&+!jR9tVA0|}LCJhS znu9q%ZaG80X#3B;>^b+n=p6aIpje~67hRgZhRbulbu|h~KLS8Pv gs08;sj!bL5P)i30Y1*ESlMn*X1Zmoyj+5j9$4O^#jQ{`u diff --git a/test/expect/ModelTester.test_vgg11_bn_expect.pkl b/test/expect/ModelTester.test_vgg11_bn_expect.pkl index f85d5c82ce5f8418f199fba3a7f20518619bcd94..ba3db474ae24bcdf4cebfb4c7b579767d223b241 100644 GIT binary patch delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu0$tyWL(5&J(+uY~yUzR%Yz2 z@K&(R{=43$;j)JHm$|Ce8JU0gCLHd!+8`5bQ)j$=&u{fs+fVjaZA|X$+0Bz5xOb|F78;^}a{;u6Se5CdUb;R#^e7 zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK634~eZf}RciCw(5akjTtWb9S&RnSb{N?C-aFA{}hQ zV!VCN3H4UnI{T|O5qJ0OF31nu?Vy#mx5AyMFEMwC)+W1IX?Ae-8 zX`9?3ZhQRGi#oH-+{v3k`z+ zZ#Fg^s3JLLUAS^kVg>;iy&b}3m<0AD$h&NyyusiJRRGEa0p6@^ATeej1gVFp1ppnX B^&tQN diff --git a/test/expect/ModelTester.test_vgg11_expect.pkl b/test/expect/ModelTester.test_vgg11_expect.pkl index 6b02f86318ed783545bdaa1f5416cf40c5e1ac24..d7a7f3da7e447b2446fc405a336095ece352fef5 100644 GIT binary patch delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu0${r`=u+wiCOy+sD~o)2lG^|MRNY`eQ==P>X=-xjgIN|J+YdtwtpP2+8nvJXSYmY;O;*< zX?r(#a$9?pY3%J?qhsT+V2Y(f#zvbE={$=)Te2%{ExW{R6~4aMvuvm7-YuWo_uTTX zwrO@U-8?gJNO;`M$ITyS3q;*Nz rbR99=%WJ#OM&Ry0>*py2wmh%S+3xU}y!%_A=$;2> zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5=Y0=Zf^kRiCw<-akhS|Gxjd>QLt71zusoS6%A{X`Ks3Ev;XdGIM8qPQa0E| z&UE`8NzGQ{VK$W7Du;iltP>MjI<>i#^`i zm9~ss;h=Qj{wg diff --git a/test/expect/ModelTester.test_vgg13_bn_expect.pkl b/test/expect/ModelTester.test_vgg13_bn_expect.pkl index fb6bcdd45a3d3840cb62c542a980824e5502d58e..e872729eba388c63756f837a9d4ba9f3ad47d813 100644 GIT binary patch delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu5PZ6LNc_Z98p4?nYa$TUowG zh-sw_o6dizIRI9r`;D8%Gtd6Z)4kcu6cLPE^Zr^MV8hlGFfeFcD%BxaZK4A6)3xB+O&r@ qf6r{ONt@8LCppF0hOuPco@1WLHb$TH_Ef!Dy?2X;uI)3nJ{tgd)0{Z~ literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5@-GixxI?Eowj@KL|Y$US-$5r<4T)0#dkJoN&L36J9+lJ*l}sk$MwQ{JpR75 zvUdyK^FmY5y5a9N>rM63Y%ZP1wo&-^W{+Ef&mN~Uc{Ujuu6qxJyxFb5ZEjl}HN#rT zyV=IO@6R5;S9A83#w6~2qpGoYm2CT-zwPOJ8|pso4p}H?v--b{ZP5AV-8Xh}+bmgZ zX?--4)%L=!S5_I0DZA$c%I;B^_Rwb2nJqRw6Pot0q&VA%l+4@HGd!5SV~WvNBQz*ul|GAA;)kU|c^H0A=?d~sfS 
zC=<|D5DxHW1X1ubi5!OlAPE$JooXBrWYCp z0p4tEI#5M&%(`&ppu`LUFnT+L%PH4Pt diff --git a/test/expect/ModelTester.test_vgg13_expect.pkl b/test/expect/ModelTester.test_vgg13_expect.pkl index b58403a951709bef18d85783eee1d6d9348147e2..ce387c2ffb75cc0ae8e5a299cd663c904c7781bd 100644 GIT binary patch delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu5PTQ*wKy>^p5Q+>5sUu%>*E z1ItRA|0?fnd{X#rZ*}wRvEFlO&*_cAdm@V3?Xuab9HS0Hx(`-`CWZTSS ze7naa*=Nt`^LaKJ+OB&Y!rtsY#%pfd8hVX39{!faOC2fJTc`MIR*)(e*1Q#13S rjn(-rHf58V_RL6gwqYrox5vUK+2-pPy*+E*t=`M&t!w* zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5@*9HxxH`fI&BX;h_=4CrhJbb%SxMEm3KC7Dg3rZJv@8f?7p;T?ndD~SN^}X zlJX4RBcLs4ZNPBdy0&qe&4Sa}HnNOw_b4X&?5RATXY)?Wb#F-co84`^=C-$@XILNb zX|^ew_-9Ydn>l+!;uH4@YiR5>k#FC_(Vf1RtLfA3Rg2|p-ZI+Se!bAV`{8bG8-Zn( z*6G=-wgLNISur@L>|PfvyXVb}hc;KwZLz7C)U>BC&Dln#Y~G$z-pMu|U-b4AzgxZc zgqN=EDvmxIP-q1SZrgDJ7*ZgNJGA&2tl^-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu5P(H-5I;_?c{L=Rcc8i_h&|;{9@W@L73Vy`Lv-g4bl&006c(pS%D7 literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5~uf#pKTaFldZD+f;~IWYucVL+p=fY_7!_R{99mUm3zV3?VIi1Mo*=^$BwJ- zk-21U`!3}6Zqt-oHaA{AwK3S)x=Wa`de1`N)ix>L3~g3>^4pXcWb8S%-PBg0e!u0B z-4@mo!4qwBSMRiuo4>|JYtjsBompyonx7xHY52CmhHw1|n*%pgcArz~*v%npz4wII z{$0*eVS8Fte%h=&|7MSw)#lv}ekxWjo3_~8e*AFHujs{lQg;5cxwZJ*ZWHg9yT6~2 zxBd3xq)pD63>#2rNvv7$`vfqgKp1yu@iSP%L#r&cs2CUvZcgSz1{YGuL72u|Ae%4F zOAloN+6uw}-i#m$o+gpwPyi%>0?<<^x^84Y@uBE^1>_;?);B=ci|i_X6ulBa7sB*H z!yv$$jZFutNRC+-t{jw@K>$W?hj1AtfjtTGE*mIsFnB^0fHFaVH!B-Rj2Q?)>LF?Y DE=2v9 diff --git a/test/expect/ModelTester.test_vgg16_expect.pkl b/test/expect/ModelTester.test_vgg16_expect.pkl index f2bde18780ee252c1a6e38f6b56fbb6aa91228d8..0e2e4fe0c01a6c2510bdfc87bad2d3fb0ac934ce 100644 GIT binary patch delta 423 zcmZ3@KA)w&fvJlt=g%Csp9)zk0wnbsn0#L{1IgM#28{>-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huE+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMu0%Z2R~abAtqZDr3HI#UDC9D zWVvOJ#jX{59s<)`ub1Xy(vCQdv~8w-{W@G+*U2}_U<3)w`>&NJ+)EU)w;`p zwR(?J;A)$BKMieW`0(2pm}KnH+ihyAF*|R+<*WS`)@MT}+CEsf(`NCaH8wXU&#=y# ztF}k*^>LePKR4KH+jPR_-W`?Q0V*B4vlXoOKJePVi&ZXckAvnUnwF1Ak81yj`DR0|571pxFQb literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5=Y~MpX~-gCfiMl3-**>(zLx{xn<9q*wUOA}x{H~mde6Fm)i%?98rrPz;kQ|4oU!NnE>qiv`TH$7 z_FGuzg-*2nv2Lf$?S*S>(x%R^el|yK5990OHc3A>*yL?GVYA`3%I+`99lOO8toH`^ z@85M+Hf+xfjh{Bfm*4EUV!L_wf&djO{;gYVjz536=WN{KJ;(O`vq@TZZud?9m%H1} z%iDVWIcf7|eTEGvv>4XEyM6)~QXq^wwD=jU;h|NQT2u^-1ve*iB7+MlqT}IKZ;%npbKGo zp-1_p+LqWqHl2AUMxcTSUZ6})W=Tm-YJ6&5N@ikSAyY>L7f`UIvLH3SII}9XxRANFkVPYc6)0Go zkzZ6&$m-1$!3^YO=9Lt(d9y^YfXv{^&r2_4_huj*WCrU>E+{Qz@iy|CINuuR=Ga1p2yAK@6ZMRMMt}gP&;pwY8frGbuQS``w`{bT zAbM&~)8xxGvts*fdVB=-D!=02Yf@Bc?RVCCkBiRc-FvK$*!l@>-eVq@uqW`XpiP>y z<(}WgMm7xhXYZ<*a%;EKWyZY>74=pZP4l+xIpxV>+h?!3*Ij< zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w 
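
These expect pickles are the regression fixtures behind test_models.py: each one stores a reference output tensor, and a model's fresh forward pass is compared against it, which is why touching model code regenerates the binary deltas summarized above. A minimal sketch of that pattern (hypothetical helper name and tolerances; the real comparison lives in the test suite's shared utilities):

    import pickle

    import torch

    def assert_matches_expect(output: torch.Tensor, expect_path: str) -> None:
        # Load the stored reference tensor and compare within tolerance;
        # regenerating these .pkl files is what produces binary deltas like the ones above.
        with open(expect_path, "rb") as f:
            expected = pickle.load(f)
        torch.testing.assert_close(output, expected, rtol=1e-3, atol=1e-3)
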
diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py
index 9b9895c1c30..72a9240c0be 100644
--- a/torchvision/models/maxvit.py
+++ b/torchvision/models/maxvit.py
@@ -1,7 +1,4 @@
 import math
-from typing import Any, Callable, List, Optional, OrderedDict, Sequence, Tuple
+from typing import Any, Callable, List, OrderedDict, Sequence, Tuple
 
 import numpy as np
 import torch
 import torch.nn.functional as F
 from torch import nn, Tensor
-from torchvision.models._api import register_model, WeightsEnum
-from torchvision.models._utils import _ovewrite_named_param
 from torchvision.ops.misc import Conv2dNormActivation, SqueezeExcitation
 from torchvision.ops.stochastic_depth import StochasticDepth
-from torchvision.utils import _log_api_usage_once
 
 
 def get_relative_position_index(height: int, width: int) -> torch.Tensor:
@@ -23,6 +20,20 @@ def get_relative_position_index(height: int, width: int) -> torch.Tensor:
     return relative_coords.sum(-1)
 
 
+class GeluWrapper(nn.Module):
+    """
+    GELU wrapper to make it compatible with `Conv2dNormActivation`, which passes inplace=True
+    to the activation function constructor.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__()
+        self._op = F.gelu
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self._op(x)
+
+
 class MBConv(nn.Module):
     def __init__(
         self,
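
The GeluWrapper added in the hunk above exists only to swallow the inplace= keyword that Conv2dNormActivation forwards to its activation constructor; plain nn.GELU accepts no such argument. A quick illustration (assuming the GeluWrapper defined above is importable):

    import torch
    from torch import nn

    gelu = GeluWrapper(inplace=True)  # extra kwargs are accepted and ignored
    x = torch.randn(2, 8)
    assert torch.allclose(gelu(x), nn.functional.gelu(x))

    # nn.GELU, by contrast, rejects the keyword outright:
    try:
        nn.GELU(inplace=True)
    except TypeError as err:
        print(err)  # unexpected keyword argument 'inplace'
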
+ """ + + def __init__(self, **kwargs) -> None: + super().__init__() + self._op = F.gelu + + def forward(self, x: Tensor) -> Tensor: + return self._op(x) + + class MBConv(nn.Module): def __init__( self, @@ -54,28 +65,20 @@ def __init__( _layers = OrderedDict() _layers["pre_norm"] = normalization_fn(in_channels) _layers["conv_a"] = Conv2dNormActivation( - in_channels, - mid_channels, - kernel_size=1, - stride=1, - padding=0, - activation_layer=activation_fn, - norm_layer=normalization_fn, - inplace=None, + in_channels, mid_channels, 1, 1, 0, activation_layer=activation_fn, norm_layer=normalization_fn ) _layers["conv_b"] = Conv2dNormActivation( mid_channels, mid_channels, - kernel_size=3, - stride=stride, - padding=1, + 3, + stride, + 1, activation_layer=activation_fn, norm_layer=normalization_fn, groups=mid_channels, - inplace=None, ) _layers["squeeze_excitation"] = SqueezeExcitation(mid_channels, sqz_channels) - _layers["conv_c"] = nn.Conv2d(in_channels=mid_channels, out_channels=out_channels, kernel_size=1, bias=True) + _layers["conv_c"] = nn.Conv2d(in_channels=mid_channels, out_channels=out_channels, kernel_size=1, bias=False) self.layers = nn.Sequential(_layers) @@ -113,13 +116,14 @@ def __init__( # initialize with truncated normal the bias self.positional_bias.data.normal_(mean=0, std=0.02) - def get_relative_positional_bias(self) -> torch.Tensor: + def _get_relative_positional_bias(self) -> torch.Tensor: bias_index = self.relative_position_index.view(-1) # type: ignore relative_bias = self.positional_bias[bias_index].view(self.max_seq_len, self.max_seq_len, -1) # type: ignore relative_bias = relative_bias.permute(2, 0, 1).contiguous() return relative_bias.unsqueeze(0) def forward(self, x: Tensor) -> Tensor: + # X, Y and stand for X-axis group dim, Y-axis group dim B, G, P, D = x.shape H, DH = self.n_heads, self.head_dim @@ -131,8 +135,9 @@ def forward(self, x: Tensor) -> Tensor: v = v.reshape(B, G, P, H, DH).permute(0, 1, 3, 2, 4) k = k * self.scale_factor + # X, Y and stand for X-axis group dim, Y-axis group dim dot_prod = torch.einsum("B G H I D, B G H J D -> B G H I J", q, k) - pos_bias = self.get_relative_positional_bias() + pos_bias = self._get_relative_positional_bias() dot_prod = F.softmax(dot_prod + pos_bias, dim=-1) @@ -199,6 +204,34 @@ def forward(self, x: Tensor) -> Tensor: return x +class MLP(nn.Module): + def __init__( + self, + in_dim: int, + hidden_dim: int, + activation_fn: Callable[..., nn.Module], + normalization_fn: Callable[..., nn.Module], + dropout: float, + ) -> None: + super().__init__() + self.in_dim = in_dim + self.hidden_dim = hidden_dim + self.activation_fn = activation_fn + self.normalization_fn = normalization_fn + self.dropout = dropout + + self.layers = nn.Sequential( + self.normalization_fn(in_dim), + nn.Linear(in_dim, hidden_dim), + self.activation_fn(), + nn.Linear(hidden_dim, in_dim), + nn.Dropout(dropout), + ) + + def forward(self, x: Tensor) -> Tensor: + return x + self.layers(x) + + class PartitionAttentionLayer(nn.Module): def __init__( self, @@ -249,14 +282,7 @@ def __init__( nn.Dropout(attn_dropout), ) - # pre-normalization similar to transformer layers - self.mlp_layer = nn.Sequential( - nn.LayerNorm(in_channels), - nn.Linear(in_channels, in_channels * mlp_ratio), - activation_fn(), - nn.Linear(in_channels * mlp_ratio, in_channels), - nn.Dropout(mlp_dropout), - ) + self.mlp_layer = MLP(in_channels, in_channels * mlp_ratio, activation_fn, normalization_fn, mlp_dropout) # layer scale factors self.attn_layer_scale = 
nn.parameter.Parameter(torch.ones(in_channels) * 1e-6) @@ -264,8 +290,8 @@ def __init__( def forward(self, x: Tensor) -> Tensor: x = self.partition_op(x) - x = x + self.attn_layer(x) * self.attn_layer_scale - x = x + self.mlp_layer(x) * self.mlp_layer_scale + x = self.attn_layer(x) * self.attn_layer_scale + x = self.mlp_layer(x) * self.mlp_layer_scale x = self.departition_op(x) return x @@ -360,8 +386,9 @@ def __init__( p_stochastic: List[float], ) -> None: super().__init__() - if not len(p_stochastic) == n_layers: - raise ValueError(f"p_stochastic must have length n_layers={n_layers}, got p_stochastic={p_stochastic}.") + assert ( + len(p_stochastic) == n_layers + ), f"p_stochastic must have length n_layers={n_layers}, got p_stochastic={p_stochastic}." self.layers = nn.ModuleList() # account for the first stride of the first layer @@ -397,12 +424,11 @@ def forward(self, x: Tensor) -> Tensor: class MaxVit(nn.Module): def __init__( self, - # input size parameters - input_size: Tuple[int, int], # stem and task parameters input_channels: int, stem_channels: int, - num_classes: int, + input_size: Tuple[int, int], + out_classes: int, # block parameters block_channels: List[int], block_layers: List[int], @@ -424,7 +450,6 @@ def __init__( partition_size: int, ) -> None: super().__init__() - _log_api_usage_once(self) # stem self.stem = nn.Sequential( @@ -475,7 +500,7 @@ def __init__( self.classifier = nn.Sequential( nn.AdaptiveAvgPool2d(1), nn.Flatten(), - nn.Linear(block_channels[-1], num_classes, bias=False), + nn.Linear(block_channels[-1], out_classes, bias=False), ) def forward(self, x: Tensor) -> Tensor: @@ -486,87 +511,85 @@ def forward(self, x: Tensor) -> Tensor: return x -def _maxvit( - # stem and task parameters - stem_channels: int, - num_classes: int, - # block parameters - block_channels: List[int], - block_layers: List[int], - stochastic_depth_prob: float, - # conv parameters - squeeze_ratio: float, - expansion_ratio: float, - # conv + transformer parameters - # normalization_fn is applied only to the conv layers - # activation_fn is applied both to conv and transformer layers - normalization_fn: Callable[..., nn.Module], - activation_fn: Callable[..., nn.Module], - # transformer parameters - head_dim: int, - mlp_ratio: int, - mlp_dropout: float, - attn_dropout: float, - # partitioning parameters - partition_size: int, - # Weights API - weights: Optional[WeightsEnum], - progress: bool, - # kwargs, - **kwargs, -) -> MaxVit: - if weights is not None: - _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"])) - assert weights.meta["min_size"][0] == weights.meta["min_size"][1] - _ovewrite_named_param(kwargs, "input_size", weights.meta["min_size"][0]) - _ovewrite_named_param(kwargs, "input_channels", weights.meta["input_channels"]) - - input_size = kwargs.pop("input_size", (224, 224)) - input_channels = kwargs.pop("input_channels", 3) - - model = MaxVit( - input_channels=input_channels, - stem_channels=stem_channels, - num_classes=num_classes, - block_channels=block_channels, - block_layers=block_layers, - stochastic_depth_prob=stochastic_depth_prob, - squeeze_ratio=squeeze_ratio, - expansion_ratio=expansion_ratio, - normalization_fn=normalization_fn, - activation_fn=activation_fn, - head_dim=head_dim, - mlp_ratio=mlp_ratio, - mlp_dropout=mlp_dropout, - attn_dropout=attn_dropout, - partition_size=partition_size, - input_size=input_size, - **kwargs, +def max_vit_T_224(num_classes: int) -> MaxVit: + return MaxVit( + input_channels=3, + stem_channels=64, + input_size=(224, 
 224),
+        out_classes=num_classes,
+        block_channels=[64, 128, 256, 512],
+        block_layers=[2, 2, 5, 2],
+        stochastic_depth_prob=0.2,
+        squeeze_ratio=0.25,
+        expansion_ratio=4.0,
+        normalization_fn=nn.BatchNorm2d,
+        activation_fn=GeluWrapper,
+        head_dim=32,
+        mlp_ratio=2,
+        mlp_dropout=0.0,
+        attn_dropout=0.0,
+        partition_size=7,
     )
-    if weights is not None:
-        model.load_state_dict(weights.get_state_dict(progress=progress))
-    return model
+
+def max_vit_S_224(num_classes: int) -> MaxVit:
+    return MaxVit(
+        input_channels=3,
+        stem_channels=64,
+        input_size=(224, 224),
+        out_classes=num_classes,
+        block_channels=[96, 192, 384, 768],
+        block_layers=[2, 2, 5, 2],
+        stochastic_depth_prob=0.3,
+        squeeze_ratio=0.25,
+        expansion_ratio=4.0,
+        normalization_fn=nn.BatchNorm2d,
+        activation_fn=GeluWrapper,
+        head_dim=32,
+        mlp_ratio=2,
+        mlp_dropout=0.0,
+        attn_dropout=0.0,
+        partition_size=7,
+    )
 
-@register_model(name="maxvit_t")
-def maxvit_t(*, weights: Optional[WeightsEnum] = None, progress: bool = True, **kwargs: Any) -> MaxVit:
-    return _maxvit(
+def max_vit_B_224(num_classes: int) -> MaxVit:
+    return MaxVit(
+        input_channels=3,
         stem_channels=64,
-        block_channels=[64, 128, 256, 512],
-        block_layers=[2, 2, 5, 2],
-        stochastic_depth_prob=0.2,
+        input_size=(224, 224),
+        out_classes=num_classes,
+        block_channels=[96, 192, 384, 768],
+        block_layers=[2, 6, 14, 2],
+        stochastic_depth_prob=0.4,
+        squeeze_ratio=0.25,
+        expansion_ratio=4.0,
+        normalization_fn=nn.BatchNorm2d,
+        activation_fn=GeluWrapper,
+        head_dim=32,
+        mlp_ratio=2,
+        mlp_dropout=0.0,
+        attn_dropout=0.0,
+        partition_size=7,
+    )
+
+
+def max_vit_L_224(num_classes: int) -> MaxVit:
+    return MaxVit(
+        input_channels=3,
+        stem_channels=128,
+        input_size=(224, 224),
+        out_classes=num_classes,
+        block_channels=[128, 256, 512, 1024],
+        block_layers=[2, 6, 14, 2],
+        stochastic_depth_prob=0.6,
         squeeze_ratio=0.25,
         expansion_ratio=4.0,
         normalization_fn=nn.BatchNorm2d,
-        activation_fn=nn.GELU,
+        activation_fn=GeluWrapper,
         head_dim=32,
         mlp_ratio=2,
         mlp_dropout=0.0,
         attn_dropout=0.0,
         partition_size=7,
-        weights=weights,
-        progress=progress,
-        **kwargs,
     )
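
With the Weights API plumbing stripped out in this commit, the four fixed-resolution builders above are plain constructors and perform no pretrained-weight loading; a usage sketch (randomly initialized weights, nothing downloaded):

    import torch

    # max_vit_S_224 / max_vit_B_224 / max_vit_L_224 share the same signature
    model = max_vit_T_224(num_classes=1000).eval()
    with torch.inference_mode():
        logits = model(torch.rand(1, 3, 224, 224))  # input must match the 224x224 grid
    print(logits.shape)  # torch.Size([1, 1000])
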
From aa95139c0c7596f2bdf3174e84522708b5ee99ab Mon Sep 17 00:00:00 2001
From: Ponku
Date: Fri, 5 Aug 2022 20:30:08 +0100
Subject: [PATCH 04/23] Re-added model changes after revert

---
 .../ModelTester.test_max_vit_L_224_expect.pkl | Bin 939 -> 0 bytes
 .../ModelTester.test_max_vit_S_224_expect.pkl | Bin 939 -> 0 bytes
 .../ModelTester.test_max_vit_T_224_expect.pkl | Bin 939 -> 0 bytes
 ...l => ModelTester.test_maxvit_t_expect.pkl} | Bin 939 -> 939 bytes
 test/test_models.py                           |   8 -
 torchvision/models/maxvit.py                  | 233 ++++++++----------
 6 files changed, 105 insertions(+), 136 deletions(-)
 delete mode 100644 test/expect/ModelTester.test_max_vit_L_224_expect.pkl
 delete mode 100644 test/expect/ModelTester.test_max_vit_S_224_expect.pkl
 delete mode 100644 test/expect/ModelTester.test_max_vit_T_224_expect.pkl
 rename test/expect/{ModelTester.test_max_vit_B_224_expect.pkl => ModelTester.test_maxvit_t_expect.pkl} (61%)

[GIT binary patch data omitted: deletion of the max_vit_{L,S,T}_224 expect pickles and rename of ModelTester.test_max_vit_B_224_expect.pkl to ModelTester.test_maxvit_t_expect.pkl.]

diff --git a/test/test_models.py b/test/test_models.py
index d42668cea2f..5ab0640a70e 100644
--- a/test/test_models.py
+++ b/test/test_models.py
@@ -594,14 +594,6 @@ def test_vitc_models(model_fn, dev):
     test_classification_model(model_fn, dev)
 
 
-@pytest.mark.parametrize(
-    "model_fn", [models.max_vit_T_224, models.max_vit_S_224, models.max_vit_B_224, models.max_vit_L_224]
-)
-@pytest.mark.parametrize("dev", cpu_and_gpu())
-def test_max_vit(model_fn, dev):
-    test_classification_model(model_fn, dev)
-
-
 @pytest.mark.parametrize("model_fn", list_model_fns(models))
 @pytest.mark.parametrize("dev", cpu_and_gpu())
 def test_classification_model(model_fn, dev):
diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py
index 72a9240c0be..54d0290397f 100644
--- a/torchvision/models/maxvit.py
+++ b/torchvision/models/maxvit.py
@@ -1,12 +1,15 @@
 import math
-from typing import Any, Callable, List, OrderedDict, Sequence, Tuple
+from typing import Any, Callable, List, Optional, OrderedDict, Sequence, Tuple
 
 import numpy as np
 import torch
 import torch.nn.functional as F
 from torch import nn, Tensor
+from torchvision.models._api import register_model, WeightsEnum
+from torchvision.models._utils import _ovewrite_named_param
 from torchvision.ops.misc import Conv2dNormActivation, SqueezeExcitation
 from torchvision.ops.stochastic_depth import StochasticDepth
+from torchvision.utils import _log_api_usage_once
 
 
 def get_relative_position_index(height: int, width: int) -> torch.Tensor:
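
The hunk above restores the model-registration and Weights API imports that the previous commit had dropped. Once the maxvit_t builder at the bottom of this diff is decorated with @register_model again, the architecture can be resolved by name; a small sketch, assuming a torchvision build that ships the model registry:

    from torchvision.models import get_model

    # Resolves the name registered via @register_model(name="maxvit_t");
    # weights=None yields a randomly initialized network.
    model = get_model("maxvit_t", weights=None)
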
@@ -20,20 +23,6 @@ def get_relative_position_index(height: int, width: int) -> torch.Tensor:
     return relative_coords.sum(-1)
 
 
-class GeluWrapper(nn.Module):
-    """
-    GELU wrapper to make it compatible with `Conv2dNormActivation`, which passes inplace=True
-    to the activation function constructor.
-    """
-
-    def __init__(self, **kwargs) -> None:
-        super().__init__()
-        self._op = F.gelu
-
-    def forward(self, x: Tensor) -> Tensor:
-        return self._op(x)
-
-
 class MBConv(nn.Module):
     def __init__(
         self,
@@ -65,20 +54,28 @@ def __init__(
         _layers = OrderedDict()
         _layers["pre_norm"] = normalization_fn(in_channels)
         _layers["conv_a"] = Conv2dNormActivation(
-            in_channels, mid_channels, 1, 1, 0, activation_layer=activation_fn, norm_layer=normalization_fn
+            in_channels,
+            mid_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            activation_layer=activation_fn,
+            norm_layer=normalization_fn,
+            inplace=None,
         )
         _layers["conv_b"] = Conv2dNormActivation(
             mid_channels,
             mid_channels,
-            3,
-            stride,
-            1,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
             activation_layer=activation_fn,
             norm_layer=normalization_fn,
             groups=mid_channels,
+            inplace=None,
         )
         _layers["squeeze_excitation"] = SqueezeExcitation(mid_channels, sqz_channels)
-        _layers["conv_c"] = nn.Conv2d(in_channels=mid_channels, out_channels=out_channels, kernel_size=1, bias=False)
+        _layers["conv_c"] = nn.Conv2d(in_channels=mid_channels, out_channels=out_channels, kernel_size=1, bias=True)
 
         self.layers = nn.Sequential(_layers)
 
@@ -116,14 +113,13 @@ def __init__(
         # initialize with truncated normal the bias
         self.positional_bias.data.normal_(mean=0, std=0.02)
 
-    def _get_relative_positional_bias(self) -> torch.Tensor:
+    def get_relative_positional_bias(self) -> torch.Tensor:
         bias_index = self.relative_position_index.view(-1)  # type: ignore
         relative_bias = self.positional_bias[bias_index].view(self.max_seq_len, self.max_seq_len, -1)  # type: ignore
         relative_bias = relative_bias.permute(2, 0, 1).contiguous()
         return relative_bias.unsqueeze(0)
 
     def forward(self, x: Tensor) -> Tensor:
-        # X and Y stand for the X-axis group dim and the Y-axis group dim
         B, G, P, D = x.shape
         H, DH = self.n_heads, self.head_dim
 
@@ -135,9 +131,8 @@ def forward(self, x: Tensor) -> Tensor:
         v = v.reshape(B, G, P, H, DH).permute(0, 1, 3, 2, 4)
 
         k = k * self.scale_factor
-        # X and Y stand for the X-axis group dim and the Y-axis group dim
         dot_prod = torch.einsum("B G H I D, B G H J D -> B G H I J", q, k)
-        pos_bias = self._get_relative_positional_bias()
+        pos_bias = self.get_relative_positional_bias()
 
         dot_prod = F.softmax(dot_prod + pos_bias, dim=-1)
 
@@ -204,34 +199,6 @@ def forward(self, x: Tensor) -> Tensor:
     return x
 
 
-class MLP(nn.Module):
-    def __init__(
-        self,
-        in_dim: int,
-        hidden_dim: int,
-        activation_fn: Callable[..., nn.Module],
-        normalization_fn: Callable[..., nn.Module],
-        dropout: float,
-    ) -> None:
-        super().__init__()
-        self.in_dim = in_dim
-        self.hidden_dim = hidden_dim
-        self.activation_fn = activation_fn
-        self.normalization_fn = normalization_fn
-        self.dropout = dropout
-
-        self.layers = nn.Sequential(
-            self.normalization_fn(in_dim),
-            nn.Linear(in_dim, hidden_dim),
-            self.activation_fn(),
-            nn.Linear(hidden_dim, in_dim),
-            nn.Dropout(dropout),
-        )
-
-    def forward(self, x: Tensor) -> Tensor:
-        return x + self.layers(x)
-
-
 class PartitionAttentionLayer(nn.Module):
     def __init__(
         self,
@@ -282,7 +249,14 @@ def __init__(
             nn.Dropout(attn_dropout),
         )
 
-        self.mlp_layer = MLP(in_channels, in_channels * mlp_ratio, activation_fn, normalization_fn, mlp_dropout)
+        # 
pre-normalization similar to transformer layers + self.mlp_layer = nn.Sequential( + nn.LayerNorm(in_channels), + nn.Linear(in_channels, in_channels * mlp_ratio), + activation_fn(), + nn.Linear(in_channels * mlp_ratio, in_channels), + nn.Dropout(mlp_dropout), + ) # layer scale factors self.attn_layer_scale = nn.parameter.Parameter(torch.ones(in_channels) * 1e-6) @@ -290,8 +264,8 @@ def __init__( def forward(self, x: Tensor) -> Tensor: x = self.partition_op(x) - x = self.attn_layer(x) * self.attn_layer_scale - x = self.mlp_layer(x) * self.mlp_layer_scale + x = x + self.attn_layer(x) * self.attn_layer_scale + x = x + self.mlp_layer(x) * self.mlp_layer_scale x = self.departition_op(x) return x @@ -386,9 +360,8 @@ def __init__( p_stochastic: List[float], ) -> None: super().__init__() - assert ( - len(p_stochastic) == n_layers - ), f"p_stochastic must have length n_layers={n_layers}, got p_stochastic={p_stochastic}." + if not len(p_stochastic) == n_layers: + raise ValueError(f"p_stochastic must have length n_layers={n_layers}, got p_stochastic={p_stochastic}.") self.layers = nn.ModuleList() # account for the first stride of the first layer @@ -424,11 +397,12 @@ def forward(self, x: Tensor) -> Tensor: class MaxVit(nn.Module): def __init__( self, + # input size parameters + input_size: Tuple[int, int], # stem and task parameters input_channels: int, stem_channels: int, - input_size: Tuple[int, int], - out_classes: int, + num_classes: int, # block parameters block_channels: List[int], block_layers: List[int], @@ -450,6 +424,7 @@ def __init__( partition_size: int, ) -> None: super().__init__() + _log_api_usage_once(self) # stem self.stem = nn.Sequential( @@ -500,7 +475,7 @@ def __init__( self.classifier = nn.Sequential( nn.AdaptiveAvgPool2d(1), nn.Flatten(), - nn.Linear(block_channels[-1], out_classes, bias=False), + nn.Linear(block_channels[-1], num_classes, bias=False), ) def forward(self, x: Tensor) -> Tensor: @@ -511,85 +486,87 @@ def forward(self, x: Tensor) -> Tensor: return x -def max_vit_T_224(num_classes: int) -> MaxVit: - return MaxVit( - input_channels=3, - stem_channels=64, - input_size=(224, 224), - out_classes=num_classes, - block_channels=[64, 128, 256, 512], - block_layers=[2, 2, 5, 2], - stochastic_depth_prob=0.2, - squeeze_ratio=0.25, - expansion_ratio=4.0, - normalization_fn=nn.BatchNorm2d, - activation_fn=GeluWrapper, - head_dim=32, - mlp_ratio=2, - mlp_dropout=0.0, - attn_dropout=0.0, - partition_size=7, +def _maxvit( + # stem and task parameters + stem_channels: int, + num_classes: int, + # block parameters + block_channels: List[int], + block_layers: List[int], + stochastic_depth_prob: float, + # conv parameters + squeeze_ratio: float, + expansion_ratio: float, + # conv + transformer parameters + # normalization_fn is applied only to the conv layers + # activation_fn is applied both to conv and transformer layers + normalization_fn: Callable[..., nn.Module], + activation_fn: Callable[..., nn.Module], + # transformer parameters + head_dim: int, + mlp_ratio: int, + mlp_dropout: float, + attn_dropout: float, + # partitioning parameters + partition_size: int, + # Weights API + weights: Optional[WeightsEnum], + progress: bool, + # kwargs, + **kwargs, +) -> MaxVit: + if weights is not None: + _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"])) + assert weights.meta["min_size"][0] == weights.meta["min_size"][1] + _ovewrite_named_param(kwargs, "input_size", weights.meta["min_size"][0]) + _ovewrite_named_param(kwargs, "input_channels", 
weights.meta["input_channels"]) + + input_size = kwargs.pop("input_size", (224, 224)) + input_channels = kwargs.pop("input_channels", 3) + + model = MaxVit( + input_channels=input_channels, + stem_channels=stem_channels, + num_classes=num_classes, + block_channels=block_channels, + block_layers=block_layers, + stochastic_depth_prob=stochastic_depth_prob, + squeeze_ratio=squeeze_ratio, + expansion_ratio=expansion_ratio, + normalization_fn=normalization_fn, + activation_fn=activation_fn, + head_dim=head_dim, + mlp_ratio=mlp_ratio, + mlp_dropout=mlp_dropout, + attn_dropout=attn_dropout, + partition_size=partition_size, + input_size=input_size, + **kwargs, ) + if weights is not None: + model.load_state_dict(weights.get_state_dict(progress=progress)) -def max_vit_S_224(num_classes: int) -> MaxVit: - return MaxVit( - input_channels=3, - stem_channels=64, - input_size=(224, 224), - out_classes=num_classes, - block_channels=[96, 192, 384, 768], - block_layers=[2, 2, 5, 2], - stochastic_depth_prob=0.3, - squeeze_ratio=0.25, - expansion_ratio=4.0, - normalization_fn=nn.BatchNorm2d, - activation_fn=GeluWrapper, - head_dim=32, - mlp_ratio=2, - mlp_dropout=0.0, - attn_dropout=0.0, - partition_size=7, - ) + return model -def max_vit_B_224(num_classes: int) -> MaxVit: - return MaxVit( - input_channels=3, +@register_model(name="maxvit_t") +def maxvit_t(*, weights: Optional[WeightsEnum] = None, progress: bool = True, **kwargs: Any) -> MaxVit: + return _maxvit( stem_channels=64, - input_size=(224, 224), - out_classes=num_classes, - block_channels=[96, 192, 384, 768], - block_layers=[2, 6, 14, 2], - stochastic_depth_prob=0.4, - squeeze_ratio=0.25, - expansion_ratio=4.0, - normalization_fn=nn.BatchNorm2d, - activation_fn=GeluWrapper, - head_dim=32, - mlp_ratio=2, - mlp_dropout=0.0, - attn_dropout=0.0, - partition_size=7, - ) - - -def max_vit_L_224(num_classes: int) -> MaxVit: - return MaxVit( - input_channels=3, - stem_channels=128, - input_size=(224, 224), - out_classes=num_classes, - block_channels=[128, 256, 512, 1024], - block_layers=[2, 6, 14, 2], - stochastic_depth_prob=0.6, + block_channels=[64, 128, 256, 512], + block_layers=[2, 2, 5, 2], + stochastic_depth_prob=0.2, squeeze_ratio=0.25, expansion_ratio=4.0, normalization_fn=nn.BatchNorm2d, - activation_fn=GeluWrapper, + activation_fn=nn.GELU, head_dim=32, mlp_ratio=2, mlp_dropout=0.0, attn_dropout=0.0, partition_size=7, + weights=weights, + progress=progress, + **kwargs, ) From 1fddecc8d4e3cadafa33993dc82b0f82b93a8dd7 Mon Sep 17 00:00:00 2001 From: TeodorPoncu Date: Wed, 14 Sep 2022 17:04:33 +0000 Subject: [PATCH 05/23] aligned with partial original implementation --- references/classification/presets.py | 6 +- .../classification/run_with_submitit.py | 122 ++++++++++++++ references/classification/train.py | 47 ++++-- references/classification/utils.py | 12 ++ torchvision/models/maxvit.py | 152 ++++++++++++------ 5 files changed, 280 insertions(+), 59 deletions(-) create mode 100644 references/classification/run_with_submitit.py diff --git a/references/classification/presets.py b/references/classification/presets.py index 6bc38e72953..ffc6d5b77d6 100644 --- a/references/classification/presets.py +++ b/references/classification/presets.py @@ -13,14 +13,16 @@ def __init__( interpolation=InterpolationMode.BILINEAR, hflip_prob=0.5, auto_augment_policy=None, + policy_magnitude=9, random_erase_prob=0.0, + center_crop=False, ): - trans = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)] + trans = [transforms.RandomResizedCrop(crop_size, 
 interpolation=interpolation)] if not center_crop else [transforms.CenterCrop(crop_size)]
         if hflip_prob > 0:
             trans.append(transforms.RandomHorizontalFlip(hflip_prob))
         if auto_augment_policy is not None:
             if auto_augment_policy == "ra":
-                trans.append(autoaugment.RandAugment(interpolation=interpolation))
+                trans.append(autoaugment.RandAugment(interpolation=interpolation, magnitude=policy_magnitude))
             elif auto_augment_policy == "ta_wide":
                 trans.append(autoaugment.TrivialAugmentWide(interpolation=interpolation))
             elif auto_augment_policy == "augmix":
diff --git a/references/classification/run_with_submitit.py b/references/classification/run_with_submitit.py
new file mode 100644
index 00000000000..faa099a36d7
--- /dev/null
+++ b/references/classification/run_with_submitit.py
@@ -0,0 +1,122 @@
+import argparse
+import os
+import uuid
+from pathlib import Path
+
+import train
+import submitit
+
+
+def parse_args():
+    train_parser = train.get_args_parser(add_help=False)
+    parser = argparse.ArgumentParser("Submitit for train", parents=[train_parser], add_help=True)
+    parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node")
+    parser.add_argument("--nodes", default=1, type=int, help="Number of nodes to request")
+    parser.add_argument("--timeout", default=60*24*30, type=int, help="Duration of the job")
+    parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.")
+    parser.add_argument("--partition", default="train", type=str, help="the partition (default train).")
+    return parser.parse_args()
+
+
+def get_shared_folder() -> Path:
+    user = os.getenv("USER")
+    path = "/data/checkpoints"
+    if Path(path).is_dir():
+        p = Path(f"{path}/{user}/experiments")
+        p.mkdir(exist_ok=True)
+        return p
+    raise RuntimeError("No shared folder available")
+
+
+def get_init_file_folder() -> Path:
+    user = os.getenv("USER")
+    path = "/shared"
+    if Path(path).is_dir():
+        p = Path(f"{path}/{user}")
+        p.mkdir(exist_ok=True)
+        return p
+    raise RuntimeError("No shared folder available")
+
+
+def get_init_file():
+    # Init file must not exist, but its parent dir must exist.
+ os.makedirs(str(get_init_file_folder()), exist_ok=True) + init_file = get_init_file_folder() / f"{uuid.uuid4().hex}_init" + if init_file.exists(): + os.remove(str(init_file)) + return init_file + + +class Trainer(object): + def __init__(self, args): + self.args = args + + def __call__(self): + import train + + self._setup_gpu_args() + train.main(self.args) + + def checkpoint(self): + import os + import submitit + from pathlib import Path + + self.args.dist_url = get_init_file().as_uri() + checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth") + if os.path.exists(checkpoint_file): + self.args.resume = checkpoint_file + print("Requeuing ", self.args) + empty_trainer = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty_trainer) + + def _setup_gpu_args(self): + import submitit + from pathlib import Path + + job_env = submitit.JobEnvironment() + self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) + self.args.gpu = job_env.local_rank + self.args.rank = job_env.global_rank + self.args.world_size = job_env.num_tasks + print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + + +def main(): + args = parse_args() + if args.job_dir == "": + args.job_dir = get_shared_folder() / "%j" + + # Note that the folder will depend on the job_id, to easily track experiments + executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=300) + + # cluster setup is defined by environment variables + num_gpus_per_node = args.ngpus + nodes = args.nodes + timeout_min = args.timeout + + executor.update_parameters( + #mem_gb=96 * num_gpus_per_node, # 768GB per machine + gpus_per_node=num_gpus_per_node, + tasks_per_node=num_gpus_per_node, # one task per GPU + cpus_per_task=12, # 96 cpus per machine + nodes=nodes, + timeout_min=timeout_min, # max is 60 * 72 + slurm_partition=args.partition, + slurm_signal_delay_s=120, + ) + + + executor.update_parameters(name="torchvision") + + args.dist_url = get_init_file().as_uri() + args.output_dir = args.job_dir + + trainer = Trainer(args) + job = executor.submit(trainer) + + print("Submitted job_id:", job.job_id) + + +if __name__ == "__main__": + main() diff --git a/references/classification/train.py b/references/classification/train.py index 96703bfdf85..d9e2f3f7a83 100644 --- a/references/classification/train.py +++ b/references/classification/train.py @@ -1,5 +1,6 @@ import datetime import os +import random import time import warnings @@ -15,7 +16,7 @@ from torchvision.transforms.functional import InterpolationMode -def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema=None, scaler=None): +def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema=None, scaler=None, scheduler=None): model.train() metric_logger = utils.MetricLogger(delimiter=" ") metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value}")) @@ -43,6 +44,9 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, arg if args.clip_grad_norm is not None: nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm) optimizer.step() + + if scheduler is not None and args.lr_step_every_batch: + scheduler.step() if model_ema and i % args.model_ema_steps == 0: model_ema.update_parameters(model) @@ -113,7 +117,7 @@ def _get_cache_path(filepath): def load_data(traindir, valdir, args): # Data loading code print("Loading data") - val_resize_size, val_crop_size, train_crop_size = 
args.val_resize_size, args.val_crop_size, args.train_crop_size + val_resize_size, val_crop_size, train_crop_size, center_crop, policy_magnitude = args.val_resize_size, args.val_crop_size, args.train_crop_size, args.train_center_crop, args.policy_magnitude interpolation = InterpolationMode(args.interpolation) print("Loading training data") @@ -129,10 +133,12 @@ def load_data(traindir, valdir, args): dataset = torchvision.datasets.ImageFolder( traindir, presets.ClassificationPresetTrain( + center_crop=center_crop, crop_size=train_crop_size, interpolation=interpolation, auto_augment_policy=auto_augment_policy, random_erase_prob=random_erase_prob, + policy_magnitude=policy_magnitude, ), ) if args.cache_dataset: @@ -182,7 +188,12 @@ def load_data(traindir, valdir, args): def main(args): if args.output_dir: utils.mkdir(args.output_dir) - + + if args.seed is None: + # randomly choose a seed + args.seed = random.randint(0, 2 ** 32) + utils.set_seed(args.seed) + utils.init_distributed_mode(args) print(args) @@ -261,13 +272,21 @@ def main(args): raise RuntimeError(f"Invalid optimizer {args.opt}. Only SGD, RMSprop and AdamW are supported.") scaler = torch.cuda.amp.GradScaler() if args.amp else None + + batches_per_epoch = len(data_loader) + warmup_iters = args.lr_warmup_epochs + total_iters = args.epochs + + if args.lr_step_every_batch: + warmup_iters *= batches_per_epoch + total_iters *= batches_per_epoch args.lr_scheduler = args.lr_scheduler.lower() if args.lr_scheduler == "steplr": main_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) elif args.lr_scheduler == "cosineannealinglr": main_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer, T_max=args.epochs - args.lr_warmup_epochs, eta_min=args.lr_min + optimizer, T_max=total_iters - warmup_iters, eta_min=args.lr_min ) elif args.lr_scheduler == "exponentiallr": main_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=args.lr_gamma) @@ -280,18 +299,18 @@ def main(args): if args.lr_warmup_epochs > 0: if args.lr_warmup_method == "linear": warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR( - optimizer, start_factor=args.lr_warmup_decay, total_iters=args.lr_warmup_epochs + optimizer, start_factor=args.lr_warmup_decay, total_iters=warmup_iters ) elif args.lr_warmup_method == "constant": warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR( - optimizer, factor=args.lr_warmup_decay, total_iters=args.lr_warmup_epochs + optimizer, factor=args.lr_warmup_decay, total_iters=warmup_iters ) else: raise RuntimeError( f"Invalid warmup lr method '{args.lr_warmup_method}'. Only linear and constant are supported." 
) lr_scheduler = torch.optim.lr_scheduler.SequentialLR( - optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[args.lr_warmup_epochs] + optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[warmup_iters] ) else: lr_scheduler = main_lr_scheduler @@ -341,8 +360,9 @@ def main(args): for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) - train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema, scaler) - lr_scheduler.step() + train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema, scaler, lr_scheduler) + if not args.lr_step_every_batch: + lr_scheduler.step() evaluate(model, criterion, data_loader_test, device=device) if model_ema: evaluate(model_ema, criterion, data_loader_test, device=device, log_suffix="EMA") @@ -371,7 +391,7 @@ def get_args_parser(add_help=True): parser = argparse.ArgumentParser(description="PyTorch Classification Training", add_help=add_help) - parser.add_argument("--data-path", default="/datasets01/imagenet_full_size/061417/", type=str, help="dataset path") + parser.add_argument("--data-path", default="/datasets01_ontap/imagenet_full_size/061417/", type=str, help="dataset path") parser.add_argument("--model", default="resnet18", type=str, help="model name") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument( @@ -425,6 +445,7 @@ def get_args_parser(add_help=True): parser.add_argument("--lr-step-size", default=30, type=int, help="decrease lr every step-size epochs") parser.add_argument("--lr-gamma", default=0.1, type=float, help="decrease lr by a factor of lr-gamma") parser.add_argument("--lr-min", default=0.0, type=float, help="minimum lr of lr schedule (default: 0.0)") + parser.add_argument("--lr-step-every-batch", action="store_true", help="decrease lr every step-size batches", default=False) parser.add_argument("--print-freq", default=10, type=int, help="print frequency") parser.add_argument("--output-dir", default=".", type=str, help="path to save outputs") parser.add_argument("--resume", default="", type=str, help="path of checkpoint") @@ -448,6 +469,7 @@ def get_args_parser(add_help=True): action="store_true", ) parser.add_argument("--auto-augment", default=None, type=str, help="auto augment policy (default: None)") + parser.add_argument("--policy-magnitude", default=9, type=int, help="magnitude of auto augment policy") parser.add_argument("--random-erase", default=0.0, type=float, help="random erasing probability (default: 0.0)") # Mixed precision training parameters @@ -486,13 +508,16 @@ def get_args_parser(add_help=True): parser.add_argument( "--train-crop-size", default=224, type=int, help="the random crop size used for training (default: 224)" ) + parser.add_argument( + "--train-center-crop", action="store_true", help="use center crop instead of random crop for training (default: False)" + ) parser.add_argument("--clip-grad-norm", default=None, type=float, help="the maximum gradient norm (default None)") parser.add_argument("--ra-sampler", action="store_true", help="whether to use Repeated Augmentation in training") parser.add_argument( "--ra-reps", default=3, type=int, help="number of repetitions for Repeated Augmentation (default: 3)" ) parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load") - + parser.add_argument("--seed", default=None, type=int, help="the seed for randomness (default: 
None). A `None` value means a seed will be randomly generated")
     return parser
 
 
diff --git a/references/classification/utils.py b/references/classification/utils.py
index c31f3928e86..ecd6ca7012d 100644
--- a/references/classification/utils.py
+++ b/references/classification/utils.py
@@ -9,6 +9,8 @@
 
 import torch
 import torch.distributed as dist
+import numpy as np
+import random
 
 
 class SmoothedValue:
@@ -463,3 +465,13 @@ def _add_params(module, prefix=""):
         if len(params[key]) > 0:
             param_groups.append({"params": params[key], "weight_decay": params_weight_decay[key]})
     return param_groups
+
+def set_seed(seed: int):
+    """
+    Function for setting all the RNGs to the same seed
+    """
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    np.random.seed(seed)
+    random.seed(seed)
\ No newline at end of file
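
set_seed above pins every RNG the training loop touches (torch CPU and CUDA, numpy, and Python's random), so two runs given the same --seed draw identical random streams; a quick determinism check (illustrative):

    import torch

    set_seed(123)
    a = torch.rand(3)
    set_seed(123)
    b = torch.rand(3)
    assert torch.equal(a, b)  # same seed, same sequence
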
self.relative_position_bias_table = nn.parameter.Parameter( + torch.empty(((2 * self.size - 1) * (2 * self.size - 1), self.n_heads), dtype=torch.float32), ) self.register_buffer("relative_position_index", get_relative_position_index(self.size, self.size)) - # initialize with truncated normal the bias - self.positional_bias.data.normal_(mean=0, std=0.02) + torch.nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02) def get_relative_positional_bias(self) -> torch.Tensor: bias_index = self.relative_position_index.view(-1) # type: ignore - relative_bias = self.positional_bias[bias_index].view(self.max_seq_len, self.max_seq_len, -1) # type: ignore + relative_bias = self.relative_position_bias_table[bias_index].view(self.max_seq_len, self.max_seq_len, -1) # type: ignore relative_bias = relative_bias.permute(2, 0, 1).contiguous() return relative_bias.unsqueeze(0) @@ -153,20 +161,19 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: res = torch.swapaxes(x, self.a, self.b) return res - + class WindowPartition(nn.Module): """ Function that takes in a tensor of shape [B, C, H, W] and partitions it in to a tensor of shape [B, H/P, W/P, P*P, C] """ - def __init__(self, partition_size: int) -> None: + def __init__(self) -> None: super().__init__() - self.partition_size = partition_size - def forward(self, x: Tensor) -> Tensor: + def forward(self, x: Tensor, p: int) -> Tensor: B, C, H, W = x.shape - P = self.partition_size + P = p # chunk up H and W dimensions x = x.reshape(B, C, H // P, P, W // P, P) x = x.permute(0, 2, 4, 3, 5, 1) @@ -181,15 +188,13 @@ class WindowDepartition(nn.Module): and partitions it into a tensor of shape [B, C, H, W] """ - def __init__(self, partition_size: int, n_partitions: int) -> None: + def __init__(self) -> None: super().__init__() - self.partition_size = partition_size - self.n_partitions = n_partitions - def forward(self, x: Tensor) -> Tensor: + def forward(self, x: Tensor, p: int, h_partitions: int, w_partitions: int) -> Tensor: B, G, PP, C = x.shape - P = self.partition_size - HP, WP = self.n_partitions, self.n_partitions + P = p + HP, WP = h_partitions, w_partitions # split P * P dimension into 2 P tile dimensionsa x = x.reshape(B, HP, WP, P, P, C) # permute into B, C, HP, P, WP, P @@ -215,6 +220,7 @@ def __init__( normalization_fn: Callable[..., nn.Module], attn_dropout: float, mlp_dropout: float, + p_stochastic_dropout: float, ) -> None: super().__init__() @@ -222,24 +228,20 @@ def __init__( self.head_dim = head_dim self.n_partitions = grid_size[0] // partition_size self.partition_type = partition_type + self.grid_size = grid_size if partition_type not in ["grid", "window"]: raise ValueError("partition_type must be either 'grid' or 'window'") if partition_type == "window": - p, g = partition_size, self.n_partitions + self.p, self.g = partition_size, self.n_partitions else: - p, g = self.n_partitions, partition_size + self.p, self.g = self.n_partitions, partition_size - partition_op = [WindowPartition(p)] - departition_op = [WindowDepartition(p, g)] - - if partition_type == "grid": - partition_op = partition_op + [SwapAxes(-2, -3)] # type: ignore - departition_op = [SwapAxes(-2, -3)] + departition_op # type: ignore - - self.partition_op = nn.Sequential(*partition_op) - self.departition_op = nn.Sequential(*departition_op) + self.partition_op = WindowPartition() + self.departition_op = WindowDepartition() + self.partition_swap = SwapAxes(-2, -3) if partition_type == "grid" else nn.Identity() + self.departition_swap = SwapAxes(-2, -3) if partition_type 
== "grid" else nn.Identity() self.attn_layer = nn.Sequential( normalization_fn(in_channels), @@ -259,14 +261,28 @@ def __init__( ) # layer scale factors - self.attn_layer_scale = nn.parameter.Parameter(torch.ones(in_channels) * 1e-6) - self.mlp_layer_scale = nn.parameter.Parameter(torch.ones(in_channels) * 1e-6) + self.stochastic_dropout = StochasticDepth(p_stochastic_dropout, mode='row') + def forward(self, x: Tensor) -> Tensor: - x = self.partition_op(x) - x = x + self.attn_layer(x) * self.attn_layer_scale - x = x + self.mlp_layer(x) * self.mlp_layer_scale - x = self.departition_op(x) + B, C, H, W = x.shape + + # Undefined behavior if H or W are not divisible by p + # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L766 + torch._assert( + H % self.p == 0 and W % self.p == 0, + f"H and W must be divisible by partition size. Got H: {H}, W: {W}, P: {self.p}" + ) + + gh, gw = H // self.p, W // self.p + + x = self.partition_op(x, self.p) + x = self.partition_swap(x) + x = x + self.stochastic_dropout(self.attn_layer(x)) + x = x + self.stochastic_dropout(self.mlp_layer(x)) + x = self.departition_swap(x) + x = self.departition_op(x, self.p, gh, gw) + return x @@ -287,6 +303,7 @@ def __init__( mlp_ratio: int, mlp_dropout: float, attn_dropout: float, + p_stochastic_dropout: float, # partitioning parameters partition_size: int, grid_size: Tuple[int, int], @@ -304,6 +321,7 @@ def __init__( stride=stride, activation_fn=activation_fn, normalization_fn=normalization_fn, + p_stochastic_dropout=p_stochastic_dropout, ) # attention layers, block -> grid layers["window_attention"] = PartitionAttentionLayer( @@ -317,6 +335,7 @@ def __init__( normalization_fn=nn.LayerNorm, attn_dropout=attn_dropout, mlp_dropout=mlp_dropout, + p_stochastic_dropout=p_stochastic_dropout, ) layers["grid_attention"] = PartitionAttentionLayer( in_channels=out_channels, @@ -329,11 +348,13 @@ def __init__( normalization_fn=nn.LayerNorm, attn_dropout=attn_dropout, mlp_dropout=mlp_dropout, + p_stochastic_dropout=p_stochastic_dropout, ) self.layers = nn.Sequential(layers) def forward(self, x: Tensor) -> Tensor: - return self.layers(x) + x = self.layers(x) + return x class MaxVitBlock(nn.Module): @@ -384,8 +405,8 @@ def __init__( attn_dropout=attn_dropout, partition_size=partition_size, grid_size=self.grid_size, + p_stochastic_dropout=p, ), - StochasticDepth(p, mode="row"), ] def forward(self, x: Tensor) -> Tensor: @@ -429,15 +450,29 @@ def __init__( # stem self.stem = nn.Sequential( Conv2dNormActivation( - input_channels, stem_channels, 3, stride=2, norm_layer=None, activation_layer=None, bias=True + input_channels, + stem_channels, + 3, + stride=2, + norm_layer=normalization_fn, + activation_layer=activation_fn, + bias=False, + inplace=None ), Conv2dNormActivation( - stem_channels, stem_channels, 3, stride=1, norm_layer=None, activation_layer=None, bias=True + stem_channels, + stem_channels, + 3, + stride=1, + norm_layer=None, + activation_layer=None, + bias=True ), ) # account for stem stride input_size = (input_size[0] // 2, input_size[1] // 2) + self.partition_size = partition_size # blocks self.blocks = nn.ModuleList() @@ -447,7 +482,7 @@ def __init__( # precompute the stochastich depth probabilities from 0 to stochastic_depth_prob # since we have N blocks with L layers, we will have N * L probabilities uniformly distributed # over the range [0, stochastic_depth_prob] - p_stochastic = np.linspace(0, stochastic_depth_prob, num=sum(block_layers)).tolist() + p_stochastic 
= np.linspace(0, stochastic_depth_prob, sum(block_layers)).tolist() p_idx = 0 for in_channel, out_channel, num_layers in zip(in_channels, out_channels, block_layers): @@ -472,11 +507,18 @@ def __init__( input_size = self.blocks[-1].grid_size p_idx += num_layers + # see https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L1137-L1158 + # for why there is Linear -> Tanh -> Linear self.classifier = nn.Sequential( nn.AdaptiveAvgPool2d(1), nn.Flatten(), + nn.LayerNorm(block_channels[-1]), + nn.Linear(block_channels[-1], block_channels[-1]), + nn.Tanh(), nn.Linear(block_channels[-1], num_classes, bias=False), ) + + self._init_weights() def forward(self, x: Tensor) -> Tensor: x = self.stem(x) @@ -484,6 +526,20 @@ def forward(self, x: Tensor) -> Tensor: x = block(x) x = self.classifier(x) return x + + def _init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.02) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, std=0.02) + if m.bias is not None: + nn.init.zeros_(m.bias) def _maxvit( @@ -550,7 +606,6 @@ def _maxvit( return model -@register_model(name="maxvit_t") def maxvit_t(*, weights: Optional[WeightsEnum] = None, progress: bool = True, **kwargs: Any) -> MaxVit: return _maxvit( stem_channels=64, @@ -559,10 +614,12 @@ def maxvit_t(*, weights: Optional[WeightsEnum] = None, progress: bool = True, ** stochastic_depth_prob=0.2, squeeze_ratio=0.25, expansion_ratio=4.0, - normalization_fn=nn.BatchNorm2d, + # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L1029-L1030 + # for the exact parameters used in batchnorm + normalization_fn=partial(nn.BatchNorm2d, eps=1e-3, momentum=0.99), activation_fn=nn.GELU, head_dim=32, - mlp_ratio=2, + mlp_ratio=4, mlp_dropout=0.0, attn_dropout=0.0, partition_size=7, @@ -570,3 +627,6 @@ def maxvit_t(*, weights: Optional[WeightsEnum] = None, progress: bool = True, ** progress=progress, **kwargs, ) + +class MaxVit_T_Weights(WeightsEnum): + pass \ No newline at end of file From b7f0e972f1c71e11fe3dbd7cb3331eaf0344a93d Mon Sep 17 00:00:00 2001 From: Ponku Date: Wed, 14 Sep 2022 18:11:56 +0100 Subject: [PATCH 06/23] removed submitit script fixed lint --- references/classification/presets.py | 6 +- .../classification/run_with_submitit.py | 122 ------------------ references/classification/train.py | 43 ++++-- references/classification/utils.py | 8 +- torchvision/models/maxvit.py | 44 +++---- 5 files changed, 60 insertions(+), 163 deletions(-) delete mode 100644 references/classification/run_with_submitit.py diff --git a/references/classification/presets.py b/references/classification/presets.py index ffc6d5b77d6..39c0a593809 100644 --- a/references/classification/presets.py +++ b/references/classification/presets.py @@ -17,7 +17,11 @@ def __init__( random_erase_prob=0.0, center_crop=False, ): - trans = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)] if center_crop else [transforms.CenterCrop(crop_size)] + trans = ( + [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)] + if center_crop + else [transforms.CenterCrop(crop_size)] + ) if hflip_prob > 0: trans.append(transforms.RandomHorizontalFlip(hflip_prob)) if auto_augment_policy is not None: diff --git a/references/classification/run_with_submitit.py 
b/references/classification/run_with_submitit.py deleted file mode 100644 index faa099a36d7..00000000000 --- a/references/classification/run_with_submitit.py +++ /dev/null @@ -1,122 +0,0 @@ -import argparse -import os -import uuid -from pathlib import Path - -import train -import submitit - - -def parse_args(): - train_parser = train.get_args_parser(add_help=False) - parser = argparse.ArgumentParser("Submitit for train", parents=[train_parser], add_help=True) - parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") - parser.add_argument("--nodes", default=1, type=int, help="Number of nodes to request") - parser.add_argument("--timeout", default=60*24*30, type=int, help="Duration of the job") - parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") - parser.add_argument("--partition", default="train", type=str, help="the partition (default train).") - return parser.parse_args() - - -def get_shared_folder() -> Path: - user = os.getenv("USER") - path = "/data/checkpoints" - if Path(path).is_dir(): - p = Path(f"{path}/{user}/experiments") - p.mkdir(exist_ok=True) - return p - raise RuntimeError("No shared folder available") - - -def get_init_file_folder() -> Path: - user = os.getenv("USER") - path = "/shared" - if Path(path).is_dir(): - p = Path(f"{path}/{user}") - p.mkdir(exist_ok=True) - return p - raise RuntimeError("No shared folder available") - - -def get_init_file(): - # Init file must not exist, but it's parent dir must exist. - os.makedirs(str(get_init_file_folder()), exist_ok=True) - init_file = get_init_file_folder() / f"{uuid.uuid4().hex}_init" - if init_file.exists(): - os.remove(str(init_file)) - return init_file - - -class Trainer(object): - def __init__(self, args): - self.args = args - - def __call__(self): - import train - - self._setup_gpu_args() - train.main(self.args) - - def checkpoint(self): - import os - import submitit - from pathlib import Path - - self.args.dist_url = get_init_file().as_uri() - checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth") - if os.path.exists(checkpoint_file): - self.args.resume = checkpoint_file - print("Requeuing ", self.args) - empty_trainer = type(self)(self.args) - return submitit.helpers.DelayedSubmission(empty_trainer) - - def _setup_gpu_args(self): - import submitit - from pathlib import Path - - job_env = submitit.JobEnvironment() - self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) - self.args.gpu = job_env.local_rank - self.args.rank = job_env.global_rank - self.args.world_size = job_env.num_tasks - print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") - - -def main(): - args = parse_args() - if args.job_dir == "": - args.job_dir = get_shared_folder() / "%j" - - # Note that the folder will depend on the job_id, to easily track experiments - executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=300) - - # cluster setup is defined by environment variables - num_gpus_per_node = args.ngpus - nodes = args.nodes - timeout_min = args.timeout - - executor.update_parameters( - #mem_gb=96 * num_gpus_per_node, # 768GB per machine - gpus_per_node=num_gpus_per_node, - tasks_per_node=num_gpus_per_node, # one task per GPU - cpus_per_task=12, # 96 cpus per machine - nodes=nodes, - timeout_min=timeout_min, # max is 60 * 72 - slurm_partition=args.partition, - slurm_signal_delay_s=120, - ) - - - executor.update_parameters(name="torchvision") - - args.dist_url = 
get_init_file().as_uri() - args.output_dir = args.job_dir - - trainer = Trainer(args) - job = executor.submit(trainer) - - print("Submitted job_id:", job.job_id) - - -if __name__ == "__main__": - main() diff --git a/references/classification/train.py b/references/classification/train.py index d9e2f3f7a83..a86b7f0895a 100644 --- a/references/classification/train.py +++ b/references/classification/train.py @@ -16,7 +16,9 @@ from torchvision.transforms.functional import InterpolationMode -def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema=None, scaler=None, scheduler=None): +def train_one_epoch( + model, criterion, optimizer, data_loader, device, epoch, args, model_ema=None, scaler=None, scheduler=None +): model.train() metric_logger = utils.MetricLogger(delimiter=" ") metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value}")) @@ -44,7 +46,7 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, arg if args.clip_grad_norm is not None: nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm) optimizer.step() - + if scheduler is not None and args.lr_step_every_batch: scheduler.step() @@ -117,7 +119,13 @@ def _get_cache_path(filepath): def load_data(traindir, valdir, args): # Data loading code print("Loading data") - val_resize_size, val_crop_size, train_crop_size, center_crop, policy_magnitude = args.val_resize_size, args.val_crop_size, args.train_crop_size, args.train_center_crop, args.policy_magnitude + val_resize_size, val_crop_size, train_crop_size, center_crop, policy_magnitude = ( + args.val_resize_size, + args.val_crop_size, + args.train_crop_size, + args.train_center_crop, + args.policy_magnitude, + ) interpolation = InterpolationMode(args.interpolation) print("Loading training data") @@ -188,12 +196,12 @@ def load_data(traindir, valdir, args): def main(args): if args.output_dir: utils.mkdir(args.output_dir) - + if args.seed is None: # randomly choose a seed - args.seed = random.randint(0, 2 ** 32) + args.seed = random.randint(0, 2**32) utils.set_seed(args.seed) - + utils.init_distributed_mode(args) print(args) @@ -272,11 +280,11 @@ def main(args): raise RuntimeError(f"Invalid optimizer {args.opt}. 
Only SGD, RMSprop and AdamW are supported.") scaler = torch.cuda.amp.GradScaler() if args.amp else None - + batches_per_epoch = len(data_loader) warmup_iters = args.lr_warmup_epochs total_iters = args.epochs - + if args.lr_step_every_batch: warmup_iters *= batches_per_epoch total_iters *= batches_per_epoch @@ -391,7 +399,9 @@ def get_args_parser(add_help=True): parser = argparse.ArgumentParser(description="PyTorch Classification Training", add_help=add_help) - parser.add_argument("--data-path", default="/datasets01_ontap/imagenet_full_size/061417/", type=str, help="dataset path") + parser.add_argument( + "--data-path", default="/datasets01_ontap/imagenet_full_size/061417/", type=str, help="dataset path" + ) parser.add_argument("--model", default="resnet18", type=str, help="model name") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument( @@ -445,7 +455,9 @@ def get_args_parser(add_help=True): parser.add_argument("--lr-step-size", default=30, type=int, help="decrease lr every step-size epochs") parser.add_argument("--lr-gamma", default=0.1, type=float, help="decrease lr by a factor of lr-gamma") parser.add_argument("--lr-min", default=0.0, type=float, help="minimum lr of lr schedule (default: 0.0)") - parser.add_argument("--lr-step-every-batch", action="store_true", help="decrease lr every step-size batches", default=False) + parser.add_argument( + "--lr-step-every-batch", action="store_true", help="decrease lr every step-size batches", default=False + ) parser.add_argument("--print-freq", default=10, type=int, help="print frequency") parser.add_argument("--output-dir", default=".", type=str, help="path to save outputs") parser.add_argument("--resume", default="", type=str, help="path of checkpoint") @@ -509,7 +521,9 @@ def get_args_parser(add_help=True): "--train-crop-size", default=224, type=int, help="the random crop size used for training (default: 224)" ) parser.add_argument( - "--train-center-crop", action="store_true", help="use center crop instead of random crop for training (default: False)" + "--train-center-crop", + action="store_true", + help="use center crop instead of random crop for training (default: False)", ) parser.add_argument("--clip-grad-norm", default=None, type=float, help="the maximum gradient norm (default None)") parser.add_argument("--ra-sampler", action="store_true", help="whether to use Repeated Augmentation in training") @@ -517,7 +531,12 @@ def get_args_parser(add_help=True): "--ra-reps", default=3, type=int, help="number of repetitions for Repeated Augmentation (default: 3)" ) parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load") - parser.add_argument("--seed", default=None, type=int, help="the seed for randomness (default: None). A `None` value means a seed will be randomly generated") + parser.add_argument( + "--seed", + default=None, + type=int, + help="the seed for randomness (default: None). 
A `None` value means a seed will be randomly generated", + ) return parser diff --git a/references/classification/utils.py b/references/classification/utils.py index ecd6ca7012d..4ce5164bc94 100644 --- a/references/classification/utils.py +++ b/references/classification/utils.py @@ -3,14 +3,15 @@ import errno import hashlib import os +import random import time from collections import defaultdict, deque, OrderedDict from typing import List, Optional, Tuple +import numpy as np + import torch import torch.distributed as dist -import numpy as np -import random class SmoothedValue: @@ -466,6 +467,7 @@ def _add_params(module, prefix=""): param_groups.append({"params": params[key], "weight_decay": params_weight_decay[key]}) return param_groups + def set_seed(seed: int): """ Function for setting all the RNGs to the same seed @@ -474,4 +476,4 @@ def set_seed(seed: int): torch.cuda.manual_seed(seed) torch.cuda.manual_seed_all(seed) np.random.seed(seed) - random.seed(seed) \ No newline at end of file + random.seed(seed) diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py index b0e9455a098..ba38548104e 100644 --- a/torchvision/models/maxvit.py +++ b/torchvision/models/maxvit.py @@ -1,5 +1,5 @@ -from functools import partial import math +from functools import partial from typing import Any, Callable, List, Optional, OrderedDict, Sequence, Tuple import numpy as np @@ -34,7 +34,7 @@ def __init__( stride: int, activation_fn: Callable[..., nn.Module], normalization_fn: Callable[..., nn.Module], - p_stochastic_dropout: float = 0., + p_stochastic_dropout: float = 0.0, ) -> None: super().__init__() @@ -52,11 +52,11 @@ def __init__( mid_channels = int(out_channels * expansion_ratio) sqz_channels = int(out_channels * squeeze_ratio) - + if p_stochastic_dropout: - self.stochastic_depth = StochasticDepth(p_stochastic_dropout, mode="row") # type: ignore + self.stochastic_depth = StochasticDepth(p_stochastic_dropout, mode="row") # type: ignore else: - self.stochastic_depth = nn.Identity() # type: ignore + self.stochastic_depth = nn.Identity() # type: ignore _layers = OrderedDict() _layers["pre_norm"] = normalization_fn(in_channels) @@ -161,7 +161,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: res = torch.swapaxes(x, self.a, self.b) return res - + class WindowPartition(nn.Module): """ Function that takes in a tensor of shape [B, C, H, W] and partitions it @@ -261,28 +261,27 @@ def __init__( ) # layer scale factors - self.stochastic_dropout = StochasticDepth(p_stochastic_dropout, mode='row') - + self.stochastic_dropout = StochasticDepth(p_stochastic_dropout, mode="row") def forward(self, x: Tensor) -> Tensor: B, C, H, W = x.shape - + # Undefined behavior if H or W are not divisible by p # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L766 torch._assert( H % self.p == 0 and W % self.p == 0, - f"H and W must be divisible by partition size. Got H: {H}, W: {W}, P: {self.p}" + f"H and W must be divisible by partition size. 
Got H: {H}, W: {W}, P: {self.p}", ) - + gh, gw = H // self.p, W // self.p - + x = self.partition_op(x, self.p) x = self.partition_swap(x) x = x + self.stochastic_dropout(self.attn_layer(x)) x = x + self.stochastic_dropout(self.mlp_layer(x)) x = self.departition_swap(x) x = self.departition_op(x, self.p, gh, gw) - + return x @@ -457,16 +456,10 @@ def __init__( norm_layer=normalization_fn, activation_layer=activation_fn, bias=False, - inplace=None + inplace=None, ), Conv2dNormActivation( - stem_channels, - stem_channels, - 3, - stride=1, - norm_layer=None, - activation_layer=None, - bias=True + stem_channels, stem_channels, 3, stride=1, norm_layer=None, activation_layer=None, bias=True ), ) @@ -517,7 +510,7 @@ def __init__( nn.Tanh(), nn.Linear(block_channels[-1], num_classes, bias=False), ) - + self._init_weights() def forward(self, x: Tensor) -> Tensor: @@ -526,7 +519,7 @@ def forward(self, x: Tensor) -> Tensor: x = block(x) x = self.classifier(x) return x - + def _init_weights(self): for m in self.modules(): if isinstance(m, nn.Conv2d): @@ -627,6 +620,7 @@ def maxvit_t(*, weights: Optional[WeightsEnum] = None, progress: bool = True, ** progress=progress, **kwargs, ) - + + class MaxVit_T_Weights(WeightsEnum): - pass \ No newline at end of file + pass From 872f40f9edb4c022c87cc5cc6a81e03e2c9d3aca Mon Sep 17 00:00:00 2001 From: Ponku Date: Wed, 14 Sep 2022 18:18:57 +0100 Subject: [PATCH 07/23] mypy fix for too many arguments --- torchvision/models/maxvit.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py index ba38548104e..c310d16a610 100644 --- a/torchvision/models/maxvit.py +++ b/torchvision/models/maxvit.py @@ -590,7 +590,6 @@ def _maxvit( attn_dropout=attn_dropout, partition_size=partition_size, input_size=input_size, - **kwargs, ) if weights is not None: From f561edfc8f012425cb30986439eaacee2695e3cf Mon Sep 17 00:00:00 2001 From: Ponku Date: Wed, 14 Sep 2022 19:24:10 +0100 Subject: [PATCH 08/23] updated old tests --- test/test_architecture_ops.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/test/test_architecture_ops.py b/test/test_architecture_ops.py index bb79dd5a242..9f254c7942b 100644 --- a/test/test_architecture_ops.py +++ b/test/test_architecture_ops.py @@ -10,29 +10,36 @@ class MaxvitTester(unittest.TestCase): def test_maxvit_window_partition(self): input_shape = (1, 3, 224, 224) partition_size = 7 + n_partitions = input_shape[3] // partition_size x = torch.randn(input_shape) - partition = WindowPartition(partition_size=7) - departition = WindowDepartition(partition_size=partition_size, n_partitions=(input_shape[3] // partition_size)) + partition = WindowPartition() + departition = WindowDepartition() - assert torch.allclose(x, departition(partition(x))) + x_hat = partition(x, partition_size) + x_hat = departition(x_hat, partition_size, n_partitions, n_partitions) + + assert torch.allclose(x, x_hat) def test_maxvit_grid_partition(self): input_shape = (1, 3, 224, 224) partition_size = 7 + n_partitions = input_shape[3] // partition_size x = torch.randn(input_shape) - partition = torch.nn.Sequential( - WindowPartition(partition_size=(input_shape[3] // partition_size)), - SwapAxes(-2, -3), - ) - departition = torch.nn.Sequential( - SwapAxes(-2, -3), - WindowDepartition(partition_size=(input_shape[3] // partition_size), n_partitions=partition_size), - ) - - assert torch.allclose(x, departition(partition(x))) + pre_swap = SwapAxes(-2, -3) + post_swap = SwapAxes(-2, -3) 
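For reference, the partition/departition round trip that this test exercises can be reproduced with plain tensor ops. The sketch below is a minimal standalone version, assuming the [B, C, H, W] -> [B, (H/P)*(W/P), P*P, C] layout used by WindowPartition and WindowDepartition in maxvit.py:

    import torch

    B, C, H, W = 1, 3, 224, 224
    P = 7  # partition size; H and W must be divisible by P
    x = torch.randn(B, C, H, W)

    # window partition: [B, C, H, W] -> [B, (H//P)*(W//P), P*P, C]
    windows = (
        x.reshape(B, C, H // P, P, W // P, P)
        .permute(0, 2, 4, 3, 5, 1)
        .reshape(B, (H // P) * (W // P), P * P, C)
    )

    # departition: invert the reshape/permute pair above
    x_hat = (
        windows.reshape(B, H // P, W // P, P, P, C)
        .permute(0, 5, 1, 3, 2, 4)
        .reshape(B, C, H, W)
    )

    # the round trip is a pure permutation of elements, so it is exact
    assert torch.equal(x, x_hat)

The grid variant differs only by the SwapAxes(-2, -3) applied between partition and departition, which is what the remainder of this test checks.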
+ + partition = WindowPartition() + departition = WindowDepartition() + + x_hat = partition(x, n_partitions) + x_hat = pre_swap(x_hat) + x_hat = post_swap(x_hat) + x_hat = departition(x_hat, n_partitions, partition_size, partition_size) + + assert torch.allclose(x, x_hat) if __name__ == "__main__": From 314b82a9029183121690df70bc29f42e7cb245e5 Mon Sep 17 00:00:00 2001 From: Ponku Date: Fri, 16 Sep 2022 21:13:19 +0100 Subject: [PATCH 09/23] removed per batch lr scheduler and seed setting --- references/classification/train.py | 48 +++++++----------------------- references/classification/utils.py | 14 --------- 2 files changed, 11 insertions(+), 51 deletions(-) diff --git a/references/classification/train.py b/references/classification/train.py index a86b7f0895a..cf05d727189 100644 --- a/references/classification/train.py +++ b/references/classification/train.py @@ -1,6 +1,5 @@ import datetime import os -import random import time import warnings @@ -16,9 +15,7 @@ from torchvision.transforms.functional import InterpolationMode -def train_one_epoch( - model, criterion, optimizer, data_loader, device, epoch, args, model_ema=None, scaler=None, scheduler=None -): +def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema=None, scaler=None): model.train() metric_logger = utils.MetricLogger(delimiter=" ") metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value}")) @@ -47,9 +44,6 @@ def train_one_epoch( nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm) optimizer.step() - if scheduler is not None and args.lr_step_every_batch: - scheduler.step() - if model_ema and i % args.model_ema_steps == 0: model_ema.update_parameters(model) if epoch < args.lr_warmup_epochs: @@ -197,11 +191,6 @@ def main(args): if args.output_dir: utils.mkdir(args.output_dir) - if args.seed is None: - # randomly choose a seed - args.seed = random.randint(0, 2**32) - utils.set_seed(args.seed) - utils.init_distributed_mode(args) print(args) @@ -226,7 +215,10 @@ def main(args): mixup_transforms.append(transforms.RandomCutmix(num_classes, p=1.0, alpha=args.cutmix_alpha)) if mixup_transforms: mixupcutmix = torchvision.transforms.RandomChoice(mixup_transforms) - collate_fn = lambda batch: mixupcutmix(*default_collate(batch)) # noqa: E731 + + def collate_fn(batch): + return mixupcutmix(*default_collate(batch)) # noqa: E731 + data_loader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, @@ -281,20 +273,12 @@ def main(args): scaler = torch.cuda.amp.GradScaler() if args.amp else None - batches_per_epoch = len(data_loader) - warmup_iters = args.lr_warmup_epochs - total_iters = args.epochs - - if args.lr_step_every_batch: - warmup_iters *= batches_per_epoch - total_iters *= batches_per_epoch - args.lr_scheduler = args.lr_scheduler.lower() if args.lr_scheduler == "steplr": main_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) elif args.lr_scheduler == "cosineannealinglr": main_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer, T_max=total_iters - warmup_iters, eta_min=args.lr_min + optimizer, T_max=args.epochs - args.lr_warmup_epochs, eta_min=args.lr_min ) elif args.lr_scheduler == "exponentiallr": main_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=args.lr_gamma) @@ -307,18 +291,18 @@ def main(args): if args.lr_warmup_epochs > 0: if args.lr_warmup_method == "linear": warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR( - optimizer, 
start_factor=args.lr_warmup_decay, total_iters=warmup_iters + optimizer, start_factor=args.lr_warmup_decay, total_iters=args.lr_warmup_epochs ) elif args.lr_warmup_method == "constant": warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR( - optimizer, factor=args.lr_warmup_decay, total_iters=warmup_iters + optimizer, factor=args.lr_warmup_decay, total_iters=args.lr_warmup_epochs ) else: raise RuntimeError( f"Invalid warmup lr method '{args.lr_warmup_method}'. Only linear and constant are supported." ) lr_scheduler = torch.optim.lr_scheduler.SequentialLR( - optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[warmup_iters] + optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[args.lr_warmup_epochs] ) else: lr_scheduler = main_lr_scheduler @@ -368,9 +352,8 @@ def main(args): for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) - train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema, scaler, lr_scheduler) - if not args.lr_step_every_batch: - lr_scheduler.step() + train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema, scaler) + lr_scheduler.step() evaluate(model, criterion, data_loader_test, device=device) if model_ema: evaluate(model_ema, criterion, data_loader_test, device=device, log_suffix="EMA") @@ -455,9 +438,6 @@ def get_args_parser(add_help=True): parser.add_argument("--lr-step-size", default=30, type=int, help="decrease lr every step-size epochs") parser.add_argument("--lr-gamma", default=0.1, type=float, help="decrease lr by a factor of lr-gamma") parser.add_argument("--lr-min", default=0.0, type=float, help="minimum lr of lr schedule (default: 0.0)") - parser.add_argument( - "--lr-step-every-batch", action="store_true", help="decrease lr every step-size batches", default=False - ) parser.add_argument("--print-freq", default=10, type=int, help="print frequency") parser.add_argument("--output-dir", default=".", type=str, help="path to save outputs") parser.add_argument("--resume", default="", type=str, help="path of checkpoint") @@ -531,12 +511,6 @@ def get_args_parser(add_help=True): "--ra-reps", default=3, type=int, help="number of repetitions for Repeated Augmentation (default: 3)" ) parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load") - parser.add_argument( - "--seed", - default=None, - type=int, - help="the seed for randomness (default: None). 
A `None` value means a seed will be randomly generated", - ) return parser diff --git a/references/classification/utils.py b/references/classification/utils.py index 4ce5164bc94..c31f3928e86 100644 --- a/references/classification/utils.py +++ b/references/classification/utils.py @@ -3,13 +3,10 @@ import errno import hashlib import os -import random import time from collections import defaultdict, deque, OrderedDict from typing import List, Optional, Tuple -import numpy as np - import torch import torch.distributed as dist @@ -466,14 +463,3 @@ def _add_params(module, prefix=""): if len(params[key]) > 0: param_groups.append({"params": params[key], "weight_decay": params_weight_decay[key]}) return param_groups - - -def set_seed(seed: int): - """ - Function for setting all the RNGs to the same seed - """ - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - np.random.seed(seed) - random.seed(seed) From a4863e9b170e87638ce2afa52dbbcb4ec60ec3f6 Mon Sep 17 00:00:00 2001 From: Ponku Date: Fri, 16 Sep 2022 21:16:55 +0100 Subject: [PATCH 10/23] removed ontap --- references/classification/train.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/references/classification/train.py b/references/classification/train.py index cf05d727189..fc900cba073 100644 --- a/references/classification/train.py +++ b/references/classification/train.py @@ -382,9 +382,7 @@ def get_args_parser(add_help=True): parser = argparse.ArgumentParser(description="PyTorch Classification Training", add_help=add_help) - parser.add_argument( - "--data-path", default="/datasets01_ontap/imagenet_full_size/061417/", type=str, help="dataset path" - ) + parser.add_argument("--data-path", default="/datasets01/imagenet_full_size/061417/", type=str, help="dataset path") parser.add_argument("--model", default="resnet18", type=str, help="model name") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument( From 21116807ea511f9266e3e82711d43a7b46d7f1a4 Mon Sep 17 00:00:00 2001 From: TeodorPoncu Date: Fri, 16 Sep 2022 23:54:41 +0000 Subject: [PATCH 11/23] added docs, validated weights --- torchvision/models/maxvit.py | 241 ++++++++++++++++++++++++++++++++--- 1 file changed, 222 insertions(+), 19 deletions(-) diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py index c310d16a610..81e1334208b 100644 --- a/torchvision/models/maxvit.py +++ b/torchvision/models/maxvit.py @@ -6,14 +6,21 @@ import torch import torch.nn.functional as F from torch import nn, Tensor -from torchvision.models._api import WeightsEnum +from ._api import register_model, Weights, WeightsEnum +from ._meta import _IMAGENET_CATEGORIES +from ..transforms._presets import ImageClassification, InterpolationMode from torchvision.models._utils import _ovewrite_named_param from torchvision.ops.misc import Conv2dNormActivation, SqueezeExcitation from torchvision.ops.stochastic_depth import StochasticDepth from torchvision.utils import _log_api_usage_once +__all__ = [ + "MaxVit", + "MaxVit_T_Weights", + "maxvit_t", +] -def get_relative_position_index(height: int, width: int) -> torch.Tensor: +def _get_relative_position_index(height: int, width: int) -> torch.Tensor: coords = torch.stack(torch.meshgrid([torch.arange(height), torch.arange(width)])) coords_flat = torch.flatten(coords, 1) relative_coords = coords_flat[:, :, None] - coords_flat[:, None, :] @@ -23,8 +30,23 @@ def get_relative_position_index(height: int, width: int) -> torch.Tensor: 
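The relative-position index built in this hunk is easiest to see on a tiny case. A worked example for a hypothetical 2x2 window, where each of the 4x4 query/key position pairs maps into a (2*2-1)^2 = 9 entry bias table:

    import torch

    # _get_relative_position_index inlined for height = width = 2
    coords = torch.stack(torch.meshgrid([torch.arange(2), torch.arange(2)]))
    coords_flat = torch.flatten(coords, 1)
    relative_coords = (coords_flat[:, :, None] - coords_flat[:, None, :]).permute(1, 2, 0)
    relative_coords[..., 0] += 1  # shift dy from [-1, 1] into [0, 2]
    relative_coords[..., 1] += 1  # shift dx likewise
    relative_coords[..., 0] *= 3  # row stride of the (2h - 1) x (2w - 1) table
    print(relative_coords.sum(-1))
    # tensor([[4, 3, 1, 0],
    #         [5, 4, 2, 1],
    #         [7, 6, 4, 3],
    #         [8, 7, 5, 4]])

Every query/key pair at the same spatial offset receives the same index, so all such pairs share one learned entry of the bias table.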
relative_coords[:, :, 0] *= 2 * width - 1 return relative_coords.sum(-1) +torch.fx.wrap("_get_relative_position_index") + class MBConv(nn.Module): + """MBConv: Mobile Inverted Residual Bottleneck. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + expansion_ratio (float): Expansion ratio in the bottleneck. + squeeze_ratio (float): Squeeze ratio in the SE Layer. + stride (int): Stride of the depthwise convolution. + activation_fn (Callable[..., nn.Module]): Activation function. + normalization_fn (Callable[..., nn.Module]): Normalization function. + p_stochastic_dropout (float): Probability of stochastic depth. + """ + def __init__( self, in_channels: int, @@ -87,12 +109,26 @@ def __init__( self.layers = nn.Sequential(_layers) def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Input tensor with expected layout of [B, C, H, W]. + Returns: + Tensor: Output tensor with expected layout of [B, C, H / stride, W / stride]. + """ res = self.proj(x) x = self.stochastic_depth(self.layers(x)) return res + x class RelativePositionalMultiHeadAttention(nn.Module): + """Relative Positional Multi-Head Attention. + + Args: + feat_dim (int): Number of input features. + head_dim (int): Number of features per head. + max_seq_len (int): Maximum sequence length. + """ + def __init__( self, feat_dim: int, @@ -117,7 +153,7 @@ def __init__( torch.empty(((2 * self.size - 1) * (2 * self.size - 1), self.n_heads), dtype=torch.float32), ) - self.register_buffer("relative_position_index", get_relative_position_index(self.size, self.size)) + self.register_buffer("relative_position_index", _get_relative_position_index(self.size, self.size)) # initialize with truncated normal the bias torch.nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02) @@ -128,6 +164,12 @@ def get_relative_positional_bias(self) -> torch.Tensor: return relative_bias.unsqueeze(0) def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Input tensor with expected layout of [B, G, P, D]. + Returns: + Tensor: Output tensor with expected layout of [B, G, P, D]. + """ B, G, P, D = x.shape H, DH = self.n_heads, self.head_dim @@ -152,6 +194,8 @@ def forward(self, x: Tensor) -> Tensor: class SwapAxes(nn.Module): + """Permute the axes of a tensor.""" + def __init__(self, a: int, b: int) -> None: super().__init__() self.a = a @@ -164,14 +208,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class WindowPartition(nn.Module): """ - Function that takes in a tensor of shape [B, C, H, W] and partitions it - in to a tensor of shape [B, H/P, W/P, P*P, C] + Partition the input tensor into non-overlapping windows. """ - + def __init__(self) -> None: super().__init__() def forward(self, x: Tensor, p: int) -> Tensor: + """ + Args: + x (Tensor): Input tensor with expected layout of [B, C, H, W]. + p (int): Number of partitions. + Returns: + Tensor: Output tensor with expected layout of [B, H/P, W/P, P*P, C]. + """ B, C, H, W = x.shape P = p # chunk up H and W dimensions @@ -184,14 +234,22 @@ def forward(self, x: Tensor, p: int) -> Tensor: class WindowDepartition(nn.Module): """ - Function that takes in a tensor of shape [B, H/P, W/P, P*P, C] - and partitions it into a tensor of shape [B, C, H, W] + Departition the input tensor of non-overlapping windows into a feature volume of layout [B, C, H, W]. 
""" def __init__(self) -> None: super().__init__() def forward(self, x: Tensor, p: int, h_partitions: int, w_partitions: int) -> Tensor: + """ + Args: + x (Tensor): Input tensor with expected layout of [B, (H/P * W/P), P*P, C]. + p (int): Number of partitions. + h_partitions (int): Number of vertical partitions. + w_partitions (int): Number of horizontal partitions. + Returns: + Tensor: Output tensor with expected layout of [B, C, H, W]. + """ B, G, PP, C = x.shape P = p HP, WP = h_partitions, w_partitions @@ -205,6 +263,23 @@ def forward(self, x: Tensor, p: int, h_partitions: int, w_partitions: int) -> Te class PartitionAttentionLayer(nn.Module): + """ + Layer for partitioning the input tensor into non-overlapping windows and applying attention to each window. + + Args: + in_channels (int): Number of input channels. + head_dim (int): Dimension of each attention head. + partition_size (int): Size of the partitions. + partition_type (str): Type of partitioning to use. Can be either "grid" or "window". + grid_size (Tuple[int, int]): Size of the grid to partition the input tensor into. + mlp_ratio (int): Ratio of the feature size expansion in the MLP layer. + activation_fn (Callable[..., nn.Module]): Activation function to use. + normalization_fn (Callable[..., nn.Module]): Normalization function to use. + attn_dropout (float): Dropout probability for the attention layer. + mlp_dropout (float): Dropout probability for the MLP layer. + p_stochastic_dropout (float): Probability of dropping out a partition. + """ + def __init__( self, in_channels: int, @@ -264,6 +339,12 @@ def __init__( self.stochastic_dropout = StochasticDepth(p_stochastic_dropout, mode="row") def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Input tensor with expected layout of [B, C, H, W]. + Returns: + Tensor: Output tensor with expected layout of [B, C, H, W]. + """ B, C, H, W = x.shape # Undefined behavior if H or W are not divisible by p @@ -286,6 +367,26 @@ def forward(self, x: Tensor) -> Tensor: class MaxVitLayer(nn.Module): + """ + MaxVit layer consisting of a MBConv layer followed by a PartitionAttentionLayer with `window` and a PartitionAttentionLayer with `grid`. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + expansion_ratio (float): Expansion ratio in the bottleneck. + squeeze_ratio (float): Squeeze ratio in the SE Layer. + stride (int): Stride of the depthwise convolution. + activation_fn (Callable[..., nn.Module]): Activation function. + normalization_fn (Callable[..., nn.Module]): Normalization function. + head_dim (int): Dimension of the attention heads. + mlp_ratio (int): Ratio of the MLP layer. + mlp_dropout (float): Dropout probability for the MLP layer. + attn_dropout (float): Dropout probability for the attention layer. + p_stochastic_dropout (float): Probability of stochastic depth. + partition_size (int): Size of the partitions. + grid_size (Tuple[int, int]): Size of the input feature grid. + """ + def __init__( self, # conv parameters @@ -352,11 +453,38 @@ def __init__( self.layers = nn.Sequential(layers) def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Input tensor of shape (B, C, H, W). + Returns: + Tensor: Output tensor of shape (B, C, H, W). + """ x = self.layers(x) return x class MaxVitBlock(nn.Module): + """ + A MaxVit block consisting of `n_layers` MaxVit layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. 
+ expansion_ratio (float): Expansion ratio in the bottleneck. + squeeze_ratio (float): Squeeze ratio in the SE Layer. + activation_fn (Callable[..., nn.Module]): Activation function. + normalization_fn (Callable[..., nn.Module]): Normalization function. + head_dim (int): Dimension of the attention heads. + mlp_ratio (int): Ratio of the MLP layer. + mlp_dropout (float): Dropout probability for the MLP layer. + attn_dropout (float): Dropout probability for the attention layer. + p_stochastic_dropout (float): Probability of stochastic depth. + partition_size (int): Size of the partitions. + input_grid_size (Tuple[int, int]): Size of the input feature grid. + n_layers (int): Number of layers in the block. + p_stochastic (List[float]): List of probabilities for stochastic depth for each layer. + """ + def __init__( self, # conv parameters @@ -409,12 +537,39 @@ def __init__( ] def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Input tensor of shape (B, C, H, W). + Returns: + Tensor: Output tensor of shape (B, C, H, W). + """ for layer in self.layers: x = layer(x) return x class MaxVit(nn.Module): + """ + Implements MaxVit Transformer from the `MaxViT: Multi-Axis Vision Transformer `_ paper. + Args: + input_size (Tuple[int, int]): Size of the input image. + input_channels (int): Number of input channels. + stem_channels (int): Number of channels in the stem. + num_classes (int): Number of classes. + block_channels (List[int]): Number of channels in each block. + block_layers (List[int]): Number of layers in each block. + stochastic_depth_prob (float): Probability of stochastic depth. Expands to a list of probabilities for each layer that scales linearly to the specified value. + squeeze_ratio (float): Squeeze ratio in the SE Layer. + expansion_ratio (float): Expansion ratio in the MBConv bottleneck. + normalization_fn (Callable[..., nn.Module]): Normalization function. + activation_fn (Callable[..., nn.Module]): Activation function. + head_dim (int): Dimension of the attention heads. + mlp_ratio (int): Expansion ratio of the MLP layer. + mlp_dropout (float): Dropout probability for the MLP layer. + attn_dropout (float): Dropout probability for the attention layer. + partition_size (int): Size of the partitions. 
+ """ + def __init__( self, # input size parameters @@ -534,11 +689,10 @@ def _init_weights(self): if m.bias is not None: nn.init.zeros_(m.bias) - +@register_model() def _maxvit( - # stem and task parameters + # stem parameters stem_channels: int, - num_classes: int, # block parameters block_channels: List[int], block_layers: List[int], @@ -561,14 +715,17 @@ def _maxvit( # Weights API weights: Optional[WeightsEnum], progress: bool, + # task parameters + num_classes: int = 1000, # kwargs, **kwargs, ) -> MaxVit: + print(weights) + if weights is not None: _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"])) assert weights.meta["min_size"][0] == weights.meta["min_size"][1] - _ovewrite_named_param(kwargs, "input_size", weights.meta["min_size"][0]) - _ovewrite_named_param(kwargs, "input_channels", weights.meta["input_channels"]) + _ovewrite_named_param(kwargs, "input_size", weights.meta["min_size"]) input_size = kwargs.pop("input_size", (224, 224)) input_channels = kwargs.pop("input_channels", 3) @@ -597,8 +754,58 @@ def _maxvit( return model +_COMMON_META = { + "categories": _IMAGENET_CATEGORIES, +} + +class MaxVit_T_Weights(WeightsEnum): + IMAGENET1K_V1 = Weights( + # URL empty until official release + url="", + transforms=partial( + ImageClassification, crop_size=224, resize_size=224, interpolation=InterpolationMode.BICUBIC + ), + meta={ + **_COMMON_META, + "num_params": 30919624, + "min_size": (224, 224), + # Recipe empty until official release + "recipe": "", + "_metrics": { + "ImageNet-1K": { + "acc@1": 83.700, + "acc@5": 96.722, + } + }, + "_docs": """These weights reproduce closely the results of the paper using a similar training recipe.""", + }, + ) + DEFAULT = IMAGENET1K_V1 -def maxvit_t(*, weights: Optional[WeightsEnum] = None, progress: bool = True, **kwargs: Any) -> MaxVit: +@register_model() +def maxvit_t(*, weights: Optional[MaxVit_T_Weights] = None, progress: bool = True, **kwargs: Any) -> MaxVit: + """ + Constructs a maxvit_t architecture from + `MaxViT: Multi-Axis Vision Transformer `_. + + Args: + weights (:class:`~torchvision.models.MaxVit_T_Weights`, optional): The + pretrained weights to use. See + :class:`~torchvision.models.MaxVit_T_Weights` below for + more details, and possible values. By default, no pre-trained + weights are used. + progress (bool, optional): If True, displays a progress bar of the + download to stderr. Default is True. + **kwargs: parameters passed to the ``torchvision.models.maxvit.MaxVit`` + base class. Please refer to the `source code + `_ + for more details about this class. + + .. 
autoclass:: torchvision.models.MaxVit_T_Weights + :members: + """ + weights = MaxVit_T_Weights.verify(weights) + return _maxvit( stem_channels=64, block_channels=[64, 128, 256, 512], @@ -618,8 +825,4 @@ def maxvit_t(*, weights: Optional[WeightsEnum] = None, progress: bool = True, ** weights=weights, progress=progress, **kwargs, - ) - - -class MaxVit_T_Weights(WeightsEnum): - pass + ) \ No newline at end of file From cc51c2bc6474768be338516fa29c1b3fb2f349fa Mon Sep 17 00:00:00 2001 From: TeodorPoncu Date: Sat, 17 Sep 2022 13:33:43 +0000 Subject: [PATCH 12/23] fixed test expect, moved shape assertions in the begging for torch.fx compatibility --- .../ModelTester.test_maxvit_t_expect.pkl | Bin 939 -> 1078 bytes torchvision/models/maxvit.py | 54 ++++++++++++------ 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/test/expect/ModelTester.test_maxvit_t_expect.pkl b/test/expect/ModelTester.test_maxvit_t_expect.pkl index 06224d7ad33d19b768c8b058f6887886f777a6c0..3a93545f6142a03cffb439a82bac80d41c97b098 100644 GIT binary patch literal 1078 zcmWIWW@cev;NW1u0GbSz48Hj(sW~C3#U-gldL=+AJ~y$VEVCrOBtEsGAT_x}KP9mw zQLi97$IZ!}QX2ey5JP-S9zY9S*;3zu6?equ>5M3AeHDS{EG#-NZnf*D9B7nByVcpH@zvIcrHcr$t% zwH2~?8+kLf6|#3EC+Fwn09~J%pI6M~UzCzsl$zp_nOwqE$e|fj$XQaz6;#Nr7vRm# z!DMluW){#i5Dvf@QtAwX#D$c;ftwRAYB(_(?cL*Ff89=EvDQAWgX?YQBwyN_K6l67 zSsJc(!HEL<8W|M!2^w44F^1;vee?2yU76-O+q?%gb{cDSZ6zdj+S+C6+jYrC?Y-lt zwlA;O$;SCoxE)8Eg`IA#`MyTaO?&f=*6usM?2t|Ht4p@gD-YS&Je_QNBUf;jdd>d5 z3-;C670EoXT^F^$mi^Sey=7orA+@YnxphP&d%2JDp zfzjjUB!?PY7)i{S3uN=fdFi1{KwCjLz?%_7!P7Ky9BP6jPyo>>9Nk!C&#I#ssRc}j zL>a4ZfNm(VKQvGbjR5+IprO!E3-D%R(}C)iW7dV61Iia50Hc{e83qV0fx`QFuAia(dmfxFw%JyfYxBuMa4*v|zP)WxKWr-dl5Nb5&+G~0udvzE zc6blJ+6u#yFD{U6s-)^1oyJz>7(CIcCJ5KK5n%fp=qj6i*c3*_SUXwcA-R7$I ztr>rnTBmG^-cu5-YW>4L$+mob#~z;-r>r03Ub9~DU()8x%qvzBxod5Nl Tuple[int, int]: + return ( + (input_size[0] - kernel_size + 2 * padding) // stride + 1, + (input_size[1] - kernel_size + 2 * padding) // stride + 1 + ) + +def _make_block_input_shapes(input_size: Tuple[int, int], n_blocks: int) -> Tuple[int, int]: + """Util function to check that the input size is correct for a MaxVit configuration.""" + shapes = [] + block_input_shape = _get_conv_output_shape(input_size, 3, 2, 1) + for _ in range(n_blocks): + block_input_shape = _get_conv_output_shape(block_input_shape, 3, 2, 1) + shapes.append(block_input_shape) + return shapes + def _get_relative_position_index(height: int, width: int) -> torch.Tensor: coords = torch.stack(torch.meshgrid([torch.arange(height), torch.arange(width)])) coords_flat = torch.flatten(coords, 1) @@ -30,8 +45,6 @@ def _get_relative_position_index(height: int, width: int) -> torch.Tensor: relative_coords[:, :, 0] *= 2 * width - 1 return relative_coords.sum(-1) -torch.fx.wrap("_get_relative_position_index") - class MBConv(nn.Module): """MBConv: Mobile Inverted Residual Bottleneck. @@ -345,16 +358,10 @@ def forward(self, x: Tensor) -> Tensor: Returns: Tensor: Output tensor with expected layout of [B, C, H, W]. """ - B, C, H, W = x.shape - + # Undefined behavior if H or W are not divisible by p # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L766 - torch._assert( - H % self.p == 0 and W % self.p == 0, - f"H and W must be divisible by partition size. 
Got H: {H}, W: {W}, P: {self.p}", - ) - - gh, gw = H // self.p, W // self.p + gh, gw = self.grid_size[0] // self.p, self.grid_size[1] // self.p x = self.partition_op(x, self.p) x = self.partition_swap(x) @@ -463,6 +470,7 @@ def forward(self, x: Tensor) -> Tensor: return x + class MaxVitBlock(nn.Module): """ A MaxVit block consisting of `n_layers` MaxVit layers. @@ -513,7 +521,7 @@ def __init__( self.layers = nn.ModuleList() # account for the first stride of the first layer - self.grid_size = (input_grid_size[0] // 2, input_grid_size[1] // 2) + self.grid_size = _get_conv_output_shape(input_grid_size, kernel_size=3, stride=2, padding=1) for idx, p in enumerate(p_stochastic): stride = 2 if idx == 0 else 1 @@ -600,6 +608,20 @@ def __init__( ) -> None: super().__init__() _log_api_usage_once(self) + + # Make sure input size will be divisible by the partition size in all blocks + # Undefined behavior if H or W are not divisible by p + # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L766 + + # we do this check here to avoid torch.fx that cannot handle error checking on dynamic tensor shapes + block_input_sizes = _make_block_input_shapes(input_size, len(block_channels)) + for idx, block_input_size in enumerate(block_input_sizes): + if block_input_size[0] % partition_size != 0 or block_input_size[1] % partition_size != 0: + raise ValueError( + f"Input size {block_input_size} of block {idx} is not divisible by partition size {partition_size}. " + f"Consider changing the partition size or the input size.\n" + f"Current configuration yields the following block input sizes: {block_input_sizes}." + ) # stem self.stem = nn.Sequential( @@ -619,7 +641,7 @@ def __init__( ) # account for stem stride - input_size = (input_size[0] // 2, input_size[1] // 2) + input_size = _get_conv_output_shape(input_size, kernel_size=3, stride=2, padding=1) self.partition_size = partition_size # blocks @@ -689,7 +711,6 @@ def _init_weights(self): if m.bias is not None: nn.init.zeros_(m.bias) -@register_model() def _maxvit( # stem parameters stem_channels: int, @@ -720,7 +741,6 @@ def _maxvit( # kwargs, **kwargs, ) -> MaxVit: - print(weights) if weights is not None: _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"])) From d2dfe71793528132268eeada62e0ceefc1b3e2b8 Mon Sep 17 00:00:00 2001 From: TeodorPoncu Date: Sat, 17 Sep 2022 14:48:54 +0000 Subject: [PATCH 13/23] mypy fix --- torchvision/models/maxvit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py index 8af2d473baf..6251baaffc4 100644 --- a/torchvision/models/maxvit.py +++ b/torchvision/models/maxvit.py @@ -26,7 +26,7 @@ def _get_conv_output_shape(input_size: Tuple[int, int], kernel_size: int, stride (input_size[1] - kernel_size + 2 * padding) // stride + 1 ) -def _make_block_input_shapes(input_size: Tuple[int, int], n_blocks: int) -> Tuple[int, int]: +def _make_block_input_shapes(input_size: Tuple[int, int], n_blocks: int) -> List[Tuple[int, int]]: """Util function to check that the input size is correct for a MaxVit configuration.""" shapes = [] block_input_shape = _get_conv_output_shape(input_size, 3, 2, 1) From 328f9b67c768ef829ab9ff5ca2443d4eca4fdc05 Mon Sep 17 00:00:00 2001 From: Ponku Date: Sun, 18 Sep 2022 18:12:41 +0100 Subject: [PATCH 14/23] lint fix --- torchvision/models/maxvit.py | 50 ++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git 
a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py index 6251baaffc4..1c7f8fcc35e 100644 --- a/torchvision/models/maxvit.py +++ b/torchvision/models/maxvit.py @@ -8,10 +8,10 @@ from torch import nn, Tensor from torchvision.models._api import register_model, Weights, WeightsEnum from torchvision.models._meta import _IMAGENET_CATEGORIES -from torchvision.transforms._presets import ImageClassification, InterpolationMode from torchvision.models._utils import _ovewrite_named_param from torchvision.ops.misc import Conv2dNormActivation, SqueezeExcitation from torchvision.ops.stochastic_depth import StochasticDepth +from torchvision.transforms._presets import ImageClassification, InterpolationMode from torchvision.utils import _log_api_usage_once __all__ = [ @@ -20,12 +20,14 @@ "maxvit_t", ] + def _get_conv_output_shape(input_size: Tuple[int, int], kernel_size: int, stride: int, padding: int) -> Tuple[int, int]: return ( (input_size[0] - kernel_size + 2 * padding) // stride + 1, - (input_size[1] - kernel_size + 2 * padding) // stride + 1 + (input_size[1] - kernel_size + 2 * padding) // stride + 1, ) + def _make_block_input_shapes(input_size: Tuple[int, int], n_blocks: int) -> List[Tuple[int, int]]: """Util function to check that the input size is correct for a MaxVit configuration.""" shapes = [] @@ -35,6 +37,7 @@ def _make_block_input_shapes(input_size: Tuple[int, int], n_blocks: int) -> List shapes.append(block_input_shape) return shapes + def _get_relative_position_index(height: int, width: int) -> torch.Tensor: coords = torch.stack(torch.meshgrid([torch.arange(height), torch.arange(width)])) coords_flat = torch.flatten(coords, 1) @@ -48,7 +51,7 @@ def _get_relative_position_index(height: int, width: int) -> torch.Tensor: class MBConv(nn.Module): """MBConv: Mobile Inverted Residual Bottleneck. - + Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. @@ -59,7 +62,7 @@ class MBConv(nn.Module): normalization_fn (Callable[..., nn.Module]): Normalization function. p_stochastic_dropout (float): Probability of stochastic depth. """ - + def __init__( self, in_channels: int, @@ -135,13 +138,13 @@ def forward(self, x: Tensor) -> Tensor: class RelativePositionalMultiHeadAttention(nn.Module): """Relative Positional Multi-Head Attention. - + Args: feat_dim (int): Number of input features. head_dim (int): Number of features per head. max_seq_len (int): Maximum sequence length. """ - + def __init__( self, feat_dim: int, @@ -208,7 +211,7 @@ def forward(self, x: Tensor) -> Tensor: class SwapAxes(nn.Module): """Permute the axes of a tensor.""" - + def __init__(self, a: int, b: int) -> None: super().__init__() self.a = a @@ -223,7 +226,7 @@ class WindowPartition(nn.Module): """ Partition the input tensor into non-overlapping windows. """ - + def __init__(self) -> None: super().__init__() @@ -278,7 +281,7 @@ def forward(self, x: Tensor, p: int, h_partitions: int, w_partitions: int) -> Te class PartitionAttentionLayer(nn.Module): """ Layer for partitioning the input tensor into non-overlapping windows and applying attention to each window. - + Args: in_channels (int): Number of input channels. head_dim (int): Dimension of each attention head. @@ -292,7 +295,7 @@ class PartitionAttentionLayer(nn.Module): mlp_dropout (float): Dropout probability for the MLP layer. p_stochastic_dropout (float): Probability of dropping out a partition. 
""" - + def __init__( self, in_channels: int, @@ -358,7 +361,7 @@ def forward(self, x: Tensor) -> Tensor: Returns: Tensor: Output tensor with expected layout of [B, C, H, W]. """ - + # Undefined behavior if H or W are not divisible by p # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L766 gh, gw = self.grid_size[0] // self.p, self.grid_size[1] // self.p @@ -376,7 +379,7 @@ def forward(self, x: Tensor) -> Tensor: class MaxVitLayer(nn.Module): """ MaxVit layer consisting of a MBConv layer followed by a PartitionAttentionLayer with `window` and a PartitionAttentionLayer with `grid`. - + Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. @@ -393,7 +396,7 @@ class MaxVitLayer(nn.Module): partition_size (int): Size of the partitions. grid_size (Tuple[int, int]): Size of the input feature grid. """ - + def __init__( self, # conv parameters @@ -470,11 +473,10 @@ def forward(self, x: Tensor) -> Tensor: return x - class MaxVitBlock(nn.Module): """ A MaxVit block consisting of `n_layers` MaxVit layers. - + Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. @@ -492,7 +494,7 @@ class MaxVitBlock(nn.Module): n_layers (int): Number of layers in the block. p_stochastic (List[float]): List of probabilities for stochastic depth for each layer. """ - + def __init__( self, # conv parameters @@ -577,7 +579,7 @@ class MaxVit(nn.Module): attn_dropout (float): Dropout probability for the attention layer. partition_size (int): Size of the partitions. """ - + def __init__( self, # input size parameters @@ -608,11 +610,11 @@ def __init__( ) -> None: super().__init__() _log_api_usage_once(self) - + # Make sure input size will be divisible by the partition size in all blocks # Undefined behavior if H or W are not divisible by p # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L766 - + # we do this check here to avoid torch.fx that cannot handle error checking on dynamic tensor shapes block_input_sizes = _make_block_input_shapes(input_size, len(block_channels)) for idx, block_input_size in enumerate(block_input_sizes): @@ -711,6 +713,7 @@ def _init_weights(self): if m.bias is not None: nn.init.zeros_(m.bias) + def _maxvit( # stem parameters stem_channels: int, @@ -741,7 +744,7 @@ def _maxvit( # kwargs, **kwargs, ) -> MaxVit: - + if weights is not None: _ovewrite_named_param(kwargs, "num_classes", len(weights.meta["categories"])) assert weights.meta["min_size"][0] == weights.meta["min_size"][1] @@ -774,10 +777,12 @@ def _maxvit( return model + _COMMON_META = { "categories": _IMAGENET_CATEGORIES, } + class MaxVit_T_Weights(WeightsEnum): IMAGENET1K_V1 = Weights( # URL empty until official release @@ -802,6 +807,7 @@ class MaxVit_T_Weights(WeightsEnum): ) DEFAULT = IMAGENET1K_V1 + @register_model() def maxvit_t(*, weights: Optional[MaxVit_T_Weights] = None, progress: bool = True, **kwargs: Any) -> MaxVit: """ @@ -825,7 +831,7 @@ def maxvit_t(*, weights: Optional[MaxVit_T_Weights] = None, progress: bool = Tru :members: """ weights = MaxVit_T_Weights.verify(weights) - + return _maxvit( stem_channels=64, block_channels=[64, 128, 256, 512], @@ -845,4 +851,4 @@ def maxvit_t(*, weights: Optional[MaxVit_T_Weights] = None, progress: bool = Tru weights=weights, progress=progress, **kwargs, - ) \ No newline at end of file + ) From b334b7fb5434e57724fb750afb8f4450f5707ffb Mon Sep 17 00:00:00 2001 From: Ponku 
Date: Sun, 18 Sep 2022 22:36:25 +0100 Subject: [PATCH 15/23] added legacy interface --- torchvision/models/maxvit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py index 1c7f8fcc35e..cb40b88003a 100644 --- a/torchvision/models/maxvit.py +++ b/torchvision/models/maxvit.py @@ -8,7 +8,7 @@ from torch import nn, Tensor from torchvision.models._api import register_model, Weights, WeightsEnum from torchvision.models._meta import _IMAGENET_CATEGORIES -from torchvision.models._utils import _ovewrite_named_param +from torchvision.models._utils import _ovewrite_named_param, handle_legacy_interface from torchvision.ops.misc import Conv2dNormActivation, SqueezeExcitation from torchvision.ops.stochastic_depth import StochasticDepth from torchvision.transforms._presets import ImageClassification, InterpolationMode @@ -809,6 +809,7 @@ class MaxVit_T_Weights(WeightsEnum): @register_model() +@handle_legacy_interface(weights=("pretrained", MaxVit_T_Weights.IMAGENET1K_V1)) def maxvit_t(*, weights: Optional[MaxVit_T_Weights] = None, progress: bool = True, **kwargs: Any) -> MaxVit: """ Constructs a maxvit_t architecture from From ebb8c1660b802e7297e28981ea87efd5df4c958f Mon Sep 17 00:00:00 2001 From: Ponku Date: Tue, 20 Sep 2022 15:46:40 +0100 Subject: [PATCH 16/23] added weight link --- torchvision/models/maxvit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py index cb40b88003a..20a83c3a7a5 100644 --- a/torchvision/models/maxvit.py +++ b/torchvision/models/maxvit.py @@ -786,7 +786,7 @@ class MaxVit_T_Weights(WeightsEnum): IMAGENET1K_V1 = Weights( # URL empty until official release - url="", + url="https://download.pytorch.org/models/maxvit_t-bc5ab103.pth", transforms=partial( ImageClassification, crop_size=224, resize_size=224, interpolation=InterpolationMode.BICUBIC ), From 20422bca0247fc8bf7c359e1bbd815de4f5a7e76 Mon Sep 17 00:00:00 2001 From: Ponku Date: Wed, 21 Sep 2022 15:55:28 +0100 Subject: [PATCH 17/23] updated docs --- docs/source/models.rst | 1 + docs/source/models/maxvit.rst | 23 +++++++++++++++++++++++ references/classification/README.md | 8 ++++++++ torchvision/models/maxvit.py | 3 +-- 4 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 docs/source/models/maxvit.rst diff --git a/docs/source/models.rst b/docs/source/models.rst index 57eda6d38a5..bd7f1a529e0 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -216,6 +216,7 @@ weights: models/shufflenetv2 models/squeezenet models/swin_transformer + models/maxvit models/vgg models/vision_transformer models/wide_resnet diff --git a/docs/source/models/maxvit.rst b/docs/source/models/maxvit.rst new file mode 100644 index 00000000000..29aaaaab334 --- /dev/null +++ b/docs/source/models/maxvit.rst @@ -0,0 +1,23 @@ +MaxVit +=============== + +.. currentmodule:: torchvision.models + +The MaxVit transformer models are based on the `MaxViT: Multi-Axis Vision Transformer <https://arxiv.org/abs/2204.01697>`__ +paper. + + +Model builders +-------------- + +The following model builders can be used to instantiate a MaxVit model with and without pre-trained weights. +All the model builders internally rely on the ``torchvision.models.maxvit.MaxVit`` +base class. Please refer to the `source code +<https://github.com/pytorch/vision/blob/main/torchvision/models/maxvit.py>`_ for +more details about this class. + +.. 
autosummary:: + :toctree: generated/ + :template: function.rst + + maxvit_t diff --git a/references/classification/README.md b/references/classification/README.md index e8d62134ca2..04db3837016 100644 --- a/references/classification/README.md +++ b/references/classification/README.md @@ -245,6 +245,14 @@ Here `$MODEL` is one of `swin_v2_t`, `swin_v2_s` or `swin_v2_b`. Note that `--val-resize-size` was optimized in a post-training step, see their `Weights` entry for the exact value. +### MaxViT +``` +torchrun --nproc_per_node=8 --nnodes=4 train.py \ +--model $MODEL --epochs 400 --batch-size 128 --opt adamw --lr 3e-3 --weight-decay 0.05 --lr-scheduler cosineannealinglr --lr-min 1e-5 --lr-warmup-method linear --lr-warmup-epochs 32 --label-smoothing 0.1 --mixup-alpha 0.8 --clip-grad-norm 1.0 --interpolation bicubic --auto-augment ta_wide --policy-magnitude 15 --train-center-crop --model-ema --val-resize-size 224 \ +--val-crop-size 224 --train-crop-size 224 --amp --model-ema-steps 32 --transformer-embedding-decay 0 --sync-bn +``` +Here `$MODEL` is `maxvit_t`. +Note that `--val-resize-size` was not optimized in a post-training step. ### ShuffleNet V2 diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py index 20a83c3a7a5..479553de236 100644 --- a/torchvision/models/maxvit.py +++ b/torchvision/models/maxvit.py @@ -794,8 +794,7 @@ class MaxVit_T_Weights(WeightsEnum): **_COMMON_META, "num_params": 30919624, "min_size": (224, 224), - # Recipe empty until official release - "recipe": "", + "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#maxvit", "_metrics": { "ImageNet-1K": { "acc@1": 83.700, From a24e5496c3f0bac62a7d5968828ac5226fe42543 Mon Sep 17 00:00:00 2001 From: Ponku Date: Wed, 21 Sep 2022 17:26:12 +0100 Subject: [PATCH 18/23] Update references/classification/train.py Co-authored-by: Vasilis Vryniotis --- references/classification/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/references/classification/train.py b/references/classification/train.py index 1391b42db14..ef645946bf0 100644 --- a/references/classification/train.py +++ b/references/classification/train.py @@ -217,7 +217,7 @@ def main(args): mixupcutmix = torchvision.transforms.RandomChoice(mixup_transforms) def collate_fn(batch): - return mixupcutmix(*default_collate(batch)) # noqa: E731 + return mixupcutmix(*default_collate(batch)) data_loader = torch.utils.data.DataLoader( dataset, From bb425484e8120e52460148c44b414a6aedcdf4d7 Mon Sep 17 00:00:00 2001 From: Ponku Date: Wed, 21 Sep 2022 17:26:30 +0100 Subject: [PATCH 19/23] Update torchvision/models/maxvit.py Co-authored-by: Vasilis Vryniotis --- torchvision/models/maxvit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py index 479553de236..adbc394614c 100644 --- a/torchvision/models/maxvit.py +++ b/torchvision/models/maxvit.py @@ -742,7 +742,7 @@ def _maxvit( # task parameters num_classes: int = 1000, # kwargs, - **kwargs, + **kwargs: Any, ) -> MaxVit: From ed21d3d1f4d9dfc897ef39f9a296c453796c6f52 Mon Sep 17 00:00:00 2001 From: Ponku Date: Wed, 21 Sep 2022 18:23:57 +0100 Subject: [PATCH 20/23] addressed comments --- docs/source/models.rst | 2 +- references/classification/presets.py | 7 +- references/classification/train.py | 11 +- torchvision/models/maxvit.py | 167 ++++++++++++--------------- 4 files changed, 83 insertions(+), 104 deletions(-) diff --git a/docs/source/models.rst b/docs/source/models.rst 
index bd7f1a529e0..10618434f9b 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -207,6 +207,7 @@ weights: models/efficientnetv2 models/googlenet models/inception + models/maxvit models/mnasnet models/mobilenetv2 models/mobilenetv3 @@ -216,7 +217,6 @@ weights: models/shufflenetv2 models/squeezenet models/swin_transformer - models/maxvit models/vgg models/vision_transformer models/wide_resnet diff --git a/references/classification/presets.py b/references/classification/presets.py index 39c0a593809..c6028a3417b 100644 --- a/references/classification/presets.py +++ b/references/classification/presets.py @@ -13,7 +13,8 @@ def __init__( interpolation=InterpolationMode.BILINEAR, hflip_prob=0.5, auto_augment_policy=None, - policy_magnitude=9, + ra_magnitude=9, + augmix_severity=3, random_erase_prob=0.0, center_crop=False, ): @@ -26,11 +27,11 @@ def __init__( trans.append(transforms.RandomHorizontalFlip(hflip_prob)) if auto_augment_policy is not None: if auto_augment_policy == "ra": - trans.append(autoaugment.RandAugment(interpolation=interpolation, magnitude=policy_magnitude)) + trans.append(autoaugment.RandAugment(interpolation=interpolation, magnitude=ra_magnitude)) elif auto_augment_policy == "ta_wide": trans.append(autoaugment.TrivialAugmentWide(interpolation=interpolation)) elif auto_augment_policy == "augmix": - trans.append(autoaugment.AugMix(interpolation=interpolation)) + trans.append(autoaugment.AugMix(interpolation=interpolation, severity=augmix_severity)) else: aa_policy = autoaugment.AutoAugmentPolicy(auto_augment_policy) trans.append(autoaugment.AutoAugment(policy=aa_policy, interpolation=interpolation)) diff --git a/references/classification/train.py b/references/classification/train.py index ef645946bf0..5cb965c79e4 100644 --- a/references/classification/train.py +++ b/references/classification/train.py @@ -113,12 +113,11 @@ def _get_cache_path(filepath): def load_data(traindir, valdir, args): # Data loading code print("Loading data") - val_resize_size, val_crop_size, train_crop_size, center_crop, policy_magnitude = ( + val_resize_size, val_crop_size, train_crop_size, center_crop = ( args.val_resize_size, args.val_crop_size, args.train_crop_size, args.train_center_crop, - args.policy_magnitude, ) interpolation = InterpolationMode(args.interpolation) @@ -132,6 +131,8 @@ def load_data(traindir, valdir, args): else: auto_augment_policy = getattr(args, "auto_augment", None) random_erase_prob = getattr(args, "random_erase", 0.0) + ra_magnitude = getattr(args, "ra_magnitude", 9) + augmix_severity = getattr(args, "augmix_severity", 3) dataset = torchvision.datasets.ImageFolder( traindir, presets.ClassificationPresetTrain( @@ -140,7 +141,8 @@ def load_data(traindir, valdir, args): interpolation=interpolation, auto_augment_policy=auto_augment_policy, random_erase_prob=random_erase_prob, - policy_magnitude=policy_magnitude, + ra_magnitude=ra_magnitude, + augmix_severity=augmix_severity, ), ) if args.cache_dataset: @@ -459,7 +461,8 @@ def get_args_parser(add_help=True): action="store_true", ) parser.add_argument("--auto-augment", default=None, type=str, help="auto augment policy (default: None)") - parser.add_argument("--policy-magnitude", default=9, type=int, help="magnitude of auto augment policy") + parser.add_argument("--ra-magnitude", default=None, type=int, help="magnitude of auto augment policy") + parser.add_argument("--augmix-severity", default=None, type=int, help="severity of augmix policy") parser.add_argument("--random-erase", default=0.0, type=float, 
help="random erasing probability (default: 0.0)") # Mixed precision training parameters diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py index adbc394614c..a3e8ef172ba 100644 --- a/torchvision/models/maxvit.py +++ b/torchvision/models/maxvit.py @@ -58,8 +58,8 @@ class MBConv(nn.Module): expansion_ratio (float): Expansion ratio in the bottleneck. squeeze_ratio (float): Squeeze ratio in the SE Layer. stride (int): Stride of the depthwise convolution. - activation_fn (Callable[..., nn.Module]): Activation function. - normalization_fn (Callable[..., nn.Module]): Normalization function. + activation_layer (Callable[..., nn.Module]): Activation function. + normalization_layer (Callable[..., nn.Module]): Normalization function. p_stochastic_dropout (float): Probability of stochastic depth. """ @@ -70,8 +70,8 @@ def __init__( expansion_ratio: float, squeeze_ratio: float, stride: int, - activation_fn: Callable[..., nn.Module], - normalization_fn: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], + norm_layer: Callable[..., nn.Module], p_stochastic_dropout: float = 0.0, ) -> None: super().__init__() @@ -97,15 +97,15 @@ def __init__( self.stochastic_depth = nn.Identity() # type: ignore _layers = OrderedDict() - _layers["pre_norm"] = normalization_fn(in_channels) + _layers["pre_norm"] = norm_layer(in_channels) _layers["conv_a"] = Conv2dNormActivation( in_channels, mid_channels, kernel_size=1, stride=1, padding=0, - activation_layer=activation_fn, - norm_layer=normalization_fn, + activation_layer=activation_layer, + norm_layer=norm_layer, inplace=None, ) _layers["conv_b"] = Conv2dNormActivation( @@ -114,8 +114,8 @@ def __init__( kernel_size=3, stride=stride, padding=1, - activation_layer=activation_fn, - norm_layer=normalization_fn, + activation_layer=activation_layer, + norm_layer=norm_layer, groups=mid_channels, inplace=None, ) @@ -289,8 +289,8 @@ class PartitionAttentionLayer(nn.Module): partition_type (str): Type of partitioning to use. Can be either "grid" or "window". grid_size (Tuple[int, int]): Size of the grid to partition the input tensor into. mlp_ratio (int): Ratio of the feature size expansion in the MLP layer. - activation_fn (Callable[..., nn.Module]): Activation function to use. - normalization_fn (Callable[..., nn.Module]): Normalization function to use. + activation_layer (Callable[..., nn.Module]): Activation function to use. + normalization_layer (Callable[..., nn.Module]): Normalization function to use. attn_dropout (float): Dropout probability for the attention layer. mlp_dropout (float): Dropout probability for the MLP layer. p_stochastic_dropout (float): Probability of dropping out a partition. 
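For reviewers who want to try the renamed keyword arguments above, here is a minimal construction sketch. It assumes the module-level `PartitionAttentionLayer` class is importable at this point in the series; the channel, grid, and dropout values are illustrative choices, not values fixed by the patch:

```
import torch
from torch import nn
from torchvision.models.maxvit import PartitionAttentionLayer

# after this patch the layer is configured with *_layer arguments
# instead of the old *_fn names
layer = PartitionAttentionLayer(
    in_channels=64,
    head_dim=32,                 # 64 / 32 -> 2 attention heads
    partition_size=7,
    partition_type="window",     # or "grid"
    grid_size=(56, 56),          # feature-map size entering the layer
    mlp_ratio=4,
    activation_layer=nn.GELU,
    normalization_layer=nn.LayerNorm,
    attn_dropout=0.0,
    mlp_dropout=0.0,
    p_stochastic_dropout=0.0,
)

x = torch.randn(1, 64, 56, 56)   # [B, C, H, W], H and W divisible by 7
print(layer(x).shape)            # torch.Size([1, 64, 56, 56])
```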
@@ -307,8 +307,8 @@ def __init__( # because we need to know how many relative offsets there are in the grid grid_size: Tuple[int, int], mlp_ratio: int, - activation_fn: Callable[..., nn.Module], - normalization_fn: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], + normalization_layer: Callable[..., nn.Module], attn_dropout: float, mlp_dropout: float, p_stochastic_dropout: float, ) -> None: @@ -335,7 +335,7 @@ def __init__( self.departition_swap = SwapAxes(-2, -3) if partition_type == "grid" else nn.Identity() self.attn_layer = nn.Sequential( - normalization_fn(in_channels), + normalization_layer(in_channels), # it's always going to be partition_size ** 2 because # of the axis swap in the case of grid partitioning RelativePositionalMultiHeadAttention(in_channels, head_dim, partition_size**2), @@ -346,7 +346,7 @@ def __init__( self.mlp_layer = nn.Sequential( nn.LayerNorm(in_channels), nn.Linear(in_channels, in_channels * mlp_ratio), - activation_fn(), + activation_layer(), nn.Linear(in_channels * mlp_ratio, in_channels), nn.Dropout(mlp_dropout), ) @@ -365,6 +365,12 @@ def forward(self, x: Tensor) -> Tensor: # Undefined behavior if H or W are not divisible by p # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L766 gh, gw = self.grid_size[0] // self.p, self.grid_size[1] // self.p + torch._assert( + self.grid_size[0] % self.p == 0 and self.grid_size[1] % self.p == 0, + "Grid size must be divisible by partition size. Got grid size of {} and partition size of {}".format( + self.grid_size, self.p + ), + ) x = self.partition_op(x, self.p) x = self.partition_swap(x) @@ -386,8 +392,8 @@ class MaxVitLayer(nn.Module): expansion_ratio (float): Expansion ratio in the bottleneck. squeeze_ratio (float): Squeeze ratio in the SE Layer. stride (int): Stride of the depthwise convolution. - activation_fn (Callable[..., nn.Module]): Activation function. - normalization_fn (Callable[..., nn.Module]): Normalization function. + activation_layer (Callable[..., nn.Module]): Activation function. + normalization_layer (Callable[..., nn.Module]): Normalization function. head_dim (int): Dimension of the attention heads. mlp_ratio (int): Ratio of the MLP layer. mlp_dropout (float): Dropout probability for the MLP layer. 
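The divisibility contract behind the new `torch._assert` can be verified by hand. The sketch below re-implements the output-shape arithmetic for one spatial dimension; it mirrors `_get_conv_output_shape` and `_make_block_input_shapes` for illustration rather than importing them from the patch:

```
def conv_out(size, kernel_size=3, stride=2, padding=1):
    # one spatial dimension of _get_conv_output_shape
    return (size - kernel_size + 2 * padding) // stride + 1

def block_input_sizes(input_size, n_blocks):
    # mirrors _make_block_input_shapes: the stem halves the input once,
    # then the first layer of every block halves it again
    size = conv_out(input_size)
    sizes = []
    for _ in range(n_blocks):
        size = conv_out(size)
        sizes.append(size)
    return sizes

# maxvit_t: 224x224 input, 4 blocks, partition_size=7
sizes = block_input_sizes(224, 4)
print(sizes)                           # [56, 28, 14, 7]
print(all(s % 7 == 0 for s in sizes))  # True -> the assert never fires
```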
@@ -406,8 +412,8 @@ def __init__( expansion_ratio: float, stride: int, # conv + transformer parameters - normalization_fn: Callable[..., nn.Module], - activation_fn: Callable[..., nn.Module], + normalization_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], # transformer parameters head_dim: int, mlp_ratio: int, @@ -429,8 +435,8 @@ def __init__( expansion_ratio=expansion_ratio, squeeze_ratio=squeeze_ratio, stride=stride, - activation_fn=activation_fn, - normalization_fn=normalization_fn, + activation_layer=activation_layer, + norm_layer=normalization_layer, p_stochastic_dropout=p_stochastic_dropout, ) # attention layers, block -> grid @@ -441,8 +447,8 @@ def __init__( partition_type="window", grid_size=grid_size, mlp_ratio=mlp_ratio, - activation_fn=activation_fn, - normalization_fn=nn.LayerNorm, + activation_layer=activation_layer, + normalization_layer=nn.LayerNorm, attn_dropout=attn_dropout, mlp_dropout=mlp_dropout, p_stochastic_dropout=p_stochastic_dropout, @@ -454,8 +460,8 @@ def __init__( partition_type="grid", grid_size=grid_size, mlp_ratio=mlp_ratio, - activation_fn=activation_fn, - normalization_fn=nn.LayerNorm, + activation_layer=activation_layer, + normalization_layer=nn.LayerNorm, attn_dropout=attn_dropout, mlp_dropout=mlp_dropout, p_stochastic_dropout=p_stochastic_dropout, @@ -482,8 +488,8 @@ class MaxVitBlock(nn.Module): out_channels (int): Number of output channels. expansion_ratio (float): Expansion ratio in the bottleneck. squeeze_ratio (float): Squeeze ratio in the SE Layer. - activation_fn (Callable[..., nn.Module]): Activation function. - normalization_fn (Callable[..., nn.Module]): Normalization function. + activation_layer (Callable[..., nn.Module]): Activation function. + normalization_layer (Callable[..., nn.Module]): Normalization function. head_dim (int): Dimension of the attention heads. mlp_ratio (int): Ratio of the MLP layer. mlp_dropout (float): Dropout probability for the MLP layer. @@ -503,8 +509,8 @@ def __init__( squeeze_ratio: float, expansion_ratio: float, # conv + transformer parameters - normalization_fn: Callable[..., nn.Module], - activation_fn: Callable[..., nn.Module], + normalization_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], # transformer parameters head_dim: int, mlp_ratio: int, @@ -534,8 +540,8 @@ def __init__( squeeze_ratio=squeeze_ratio, expansion_ratio=expansion_ratio, stride=stride, - normalization_fn=normalization_fn, - activation_fn=activation_fn, + normalization_layer=normalization_layer, + activation_layer=activation_layer, head_dim=head_dim, mlp_ratio=mlp_ratio, mlp_dropout=mlp_dropout, @@ -571,8 +577,8 @@ class MaxVit(nn.Module): stochastic_depth_prob (float): Probability of stochastic depth. Expands to a list of probabilities for each layer that scales linearly to the specified value. squeeze_ratio (float): Squeeze ratio in the SE Layer. expansion_ratio (float): Expansion ratio in the MBConv bottleneck. - normalization_fn (Callable[..., nn.Module]): Normalization function. - activation_fn (Callable[..., nn.Module]): Activation function. + normalization_layer (Callable[..., nn.Module]): Normalization function. + activation_layer (Callable[..., nn.Module]): Activation function. head_dim (int): Dimension of the attention heads. mlp_ratio (int): Expansion ratio of the MLP layer. mlp_dropout (float): Dropout probability for the MLP layer. 
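The docstring's note that `stochastic_depth_prob` "expands to a list of probabilities for each layer that scales linearly" can be sketched as follows. This is a hand-rolled illustration of the scheduling idea, not the exact code in the diff:

```
import torch

block_layers = [2, 2, 5, 2]     # maxvit_t layout
stochastic_depth_prob = 0.2
total_layers = sum(block_layers)

# one drop probability per layer, rising linearly from 0 to the target value
p_stochastic = torch.linspace(0, stochastic_depth_prob, total_layers).tolist()

# split the flat schedule back into per-block slices
schedule, start = [], 0
for n in block_layers:
    schedule.append([round(p, 3) for p in p_stochastic[start : start + n]])
    start += n
print(schedule)  # deeper layers are dropped more often
```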
@@ -585,37 +591,42 @@ def __init__( # input size parameters input_size: Tuple[int, int], # stem and task parameters - input_channels: int, stem_channels: int, num_classes: int, + # partitioning parameters + partition_size: int, # block parameters block_channels: List[int], block_layers: List[int], + # attention head dimensions + head_dim: int, stochastic_depth_prob: float, - # conv parameters - squeeze_ratio: float, - expansion_ratio: float, # conv + transformer parameters - # normalization_fn is applied only to the conv layers - # activation_fn is applied both to conv and transformer layers - normalization_fn: Callable[..., nn.Module], - activation_fn: Callable[..., nn.Module], + # normalization_layer is applied only to the conv layers + # activation_layer is applied both to conv and transformer layers + normalization_layer: Optional[Callable[..., nn.Module]] = None, + activation_layer: Callable[..., nn.Module] = nn.GELU, + # conv parameters + squeeze_ratio: float = 0.25, + expansion_ratio: float = 4, # transformer parameters - head_dim: int, - mlp_ratio: int, - mlp_dropout: float, - attn_dropout: float, - # partitioning parameters - partition_size: int, + mlp_ratio: int = 4, + mlp_dropout: float = 0.0, + attn_dropout: float = 0.0, ) -> None: super().__init__() _log_api_usage_once(self) + input_channels = 3 + + # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L1029-L1030 + # for the exact parameters used in batchnorm + if normalization_layer is None: + normalization_layer = partial(nn.BatchNorm2d, eps=1e-3, momentum=0.99) + # Make sure input size will be divisible by the partition size in all blocks # Undefined behavior if H or W are not divisible by p # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L766 - - # we do this check here to avoid torch.fx that cannot handle error checking on dynamic tensor shapes block_input_sizes = _make_block_input_shapes(input_size, len(block_channels)) for idx, block_input_size in enumerate(block_input_sizes): if block_input_size[0] % partition_size != 0 or block_input_size[1] % partition_size != 0: @@ -632,8 +643,8 @@ def __init__( stem_channels, 3, stride=2, - norm_layer=normalization_fn, - activation_layer=activation_fn, + norm_layer=normalization_layer, + activation_layer=activation_layer, bias=False, inplace=None, ), @@ -664,8 +675,8 @@ def __init__( out_channels=out_channel, squeeze_ratio=squeeze_ratio, expansion_ratio=expansion_ratio, - normalization_fn=normalization_fn, - activation_fn=activation_fn, + normalization_layer=normalization_layer, + activation_layer=activation_layer, head_dim=head_dim, mlp_ratio=mlp_ratio, mlp_dropout=mlp_dropout, @@ -721,26 +732,13 @@ def _maxvit( block_channels: List[int], block_layers: List[int], stochastic_depth_prob: float, - # conv parameters - squeeze_ratio: float, - expansion_ratio: float, - # conv + transformer parameters - # normalization_fn is applied only to the conv layers - # activation_fn is applied both to conv and transformer layers - normalization_fn: Callable[..., nn.Module], - activation_fn: Callable[..., nn.Module], - # transformer parameters - head_dim: int, - mlp_ratio: int, - mlp_dropout: float, - attn_dropout: float, # partitioning parameters partition_size: int, + # transformer parameters + head_dim: int, # Weights API - weights: Optional[WeightsEnum], - progress: bool, - # task parameters - num_classes: int = 1000, + weights: Optional[WeightsEnum] = None, + 
progress: bool = False, # kwargs, **kwargs: Any, ) -> MaxVit: @@ -751,25 +749,16 @@ def _maxvit( _ovewrite_named_param(kwargs, "input_size", weights.meta["min_size"]) input_size = kwargs.pop("input_size", (224, 224)) - input_channels = kwargs.pop("input_channels", 3) model = MaxVit( - input_channels=input_channels, stem_channels=stem_channels, - num_classes=num_classes, block_channels=block_channels, block_layers=block_layers, stochastic_depth_prob=stochastic_depth_prob, - squeeze_ratio=squeeze_ratio, - expansion_ratio=expansion_ratio, - normalization_fn=normalization_fn, - activation_fn=activation_fn, head_dim=head_dim, - mlp_ratio=mlp_ratio, - mlp_dropout=mlp_dropout, - attn_dropout=attn_dropout, partition_size=partition_size, input_size=input_size, + **kwargs, ) if weights is not None: @@ -778,11 +767,6 @@ def _maxvit( return model -_COMMON_META = { - "categories": _IMAGENET_CATEGORIES, -} - - class MaxVit_T_Weights(WeightsEnum): IMAGENET1K_V1 = Weights( # URL empty until official release @@ -791,7 +775,7 @@ class MaxVit_T_Weights(WeightsEnum): ImageClassification, crop_size=224, resize_size=224, interpolation=InterpolationMode.BICUBIC ), meta={ - **_COMMON_META, + "categories": _IMAGENET_CATEGORIES, "num_params": 30919624, "min_size": (224, 224), "recipe": "https://github.com/pytorch/vision/tree/main/references/classification#maxvit", @@ -836,17 +820,8 @@ def maxvit_t(*, weights: Optional[MaxVit_T_Weights] = None, progress: bool = Tru stem_channels=64, block_channels=[64, 128, 256, 512], block_layers=[2, 2, 5, 2], - stochastic_depth_prob=0.2, - squeeze_ratio=0.25, - expansion_ratio=4.0, - # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L1029-L1030 - # for the exact parameters used in batchnorm - normalization_fn=partial(nn.BatchNorm2d, eps=1e-3, momentum=0.99), - activation_fn=nn.GELU, head_dim=32, - mlp_ratio=4, - mlp_dropout=0.0, - attn_dropout=0.0, + stochastic_depth_prob=0.2, partition_size=7, weights=weights, progress=progress, From 521d6d59cf123d70dae25915964718899ea520f2 Mon Sep 17 00:00:00 2001 From: Ponku Date: Thu, 22 Sep 2022 14:56:14 +0100 Subject: [PATCH 21/23] update ra_magnitude and augmix_severity default values --- references/classification/train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/references/classification/train.py b/references/classification/train.py index 5cb965c79e4..f359739b113 100644 --- a/references/classification/train.py +++ b/references/classification/train.py @@ -131,8 +131,8 @@ def load_data(traindir, valdir, args): else: auto_augment_policy = getattr(args, "auto_augment", None) random_erase_prob = getattr(args, "random_erase", 0.0) - ra_magnitude = getattr(args, "ra_magnitude", 9) - augmix_severity = getattr(args, "augmix_severity", 3) + ra_magnitude = args.ra_magnitude + augmix_severity = args.augmix_severity dataset = torchvision.datasets.ImageFolder( traindir, presets.ClassificationPresetTrain( @@ -461,8 +461,8 @@ def get_args_parser(add_help=True): action="store_true", ) parser.add_argument("--auto-augment", default=None, type=str, help="auto augment policy (default: None)") - parser.add_argument("--ra-magnitude", default=None, type=int, help="magnitude of auto augment policy") - parser.add_argument("--augmix-severity", default=None, type=int, help="severity of augmix policy") + parser.add_argument("--ra-magnitude", default=9, type=int, help="magnitude of auto augment policy") + parser.add_argument("--augmix-severity", default=3, type=int, 
help="severity of augmix policy") parser.add_argument("--random-erase", default=0.0, type=float, help="random erasing probability (default: 0.0)") # Mixed precision training parameters From 97cbcd8892f80811c97ddb6237c1f19aeca4cf91 Mon Sep 17 00:00:00 2001 From: TeodorPoncu Date: Thu, 22 Sep 2022 15:06:04 +0000 Subject: [PATCH 22/23] addressed some comments --- torchvision/models/maxvit.py | 81 ++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py index a3e8ef172ba..ad209130040 100644 --- a/torchvision/models/maxvit.py +++ b/torchvision/models/maxvit.py @@ -59,7 +59,7 @@ class MBConv(nn.Module): squeeze_ratio (float): Squeeze ratio in the SE Layer. stride (int): Stride of the depthwise convolution. activation_layer (Callable[..., nn.Module]): Activation function. - normalization_layer (Callable[..., nn.Module]): Normalization function. + norm_layer (Callable[..., nn.Module]): Normalization function. p_stochastic_dropout (float): Probability of stochastic depth. """ @@ -290,8 +290,8 @@ class PartitionAttentionLayer(nn.Module): grid_size (Tuple[int, int]): Size of the grid to partition the input tensor into. mlp_ratio (int): Ratio of the feature size expansion in the MLP layer. activation_layer (Callable[..., nn.Module]): Activation function to use. - normalization_layer (Callable[..., nn.Module]): Normalization function to use. - attn_dropout (float): Dropout probability for the attention layer. + norm_layer (Callable[..., nn.Module]): Normalization function to use. + attention_dropout (float): Dropout probability for the attention layer. mlp_dropout (float): Dropout probability for the MLP layer. p_stochastic_dropout (float): Probability of dropping out a partition. """ @@ -308,8 +308,8 @@ def __init__( grid_size: Tuple[int, int], mlp_ratio: int, activation_layer: Callable[..., nn.Module], - normalization_layer: Callable[..., nn.Module], - attn_dropout: float, + norm_layer: Callable[..., nn.Module], + attention_dropout: float, mlp_dropout: float, p_stochastic_dropout: float, ) -> None: @@ -335,11 +335,11 @@ def __init__( self.departition_swap = SwapAxes(-2, -3) if partition_type == "grid" else nn.Identity() self.attn_layer = nn.Sequential( - normalization_layer(in_channels), + norm_layer(in_channels), # it's always going to be partition_size ** 2 because # of the axis swap in the case of grid partitioning RelativePositionalMultiHeadAttention(in_channels, head_dim, partition_size**2), - nn.Dropout(attn_dropout), + nn.Dropout(attention_dropout), ) # pre-normalization similar to transformer layers @@ -393,11 +393,11 @@ class MaxVitLayer(nn.Module): squeeze_ratio (float): Squeeze ratio in the SE Layer. stride (int): Stride of the depthwise convolution. activation_layer (Callable[..., nn.Module]): Activation function. - normalization_layer (Callable[..., nn.Module]): Normalization function. + norm_layer (Callable[..., nn.Module]): Normalization function. head_dim (int): Dimension of the attention heads. mlp_ratio (int): Ratio of the MLP layer. mlp_dropout (float): Dropout probability for the MLP layer. - attn_dropout (float): Dropout probability for the attention layer. + attention_dropout (float): Dropout probability for the attention layer. p_stochastic_dropout (float): Probability of stochastic depth. partition_size (int): Size of the partitions. grid_size (Tuple[int, int]): Size of the input feature grid. 
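For intuition about the window/grid pair that each layer applies, here is a shape-only sketch of the two partitioning modes. It is plain tensor reshaping written for illustration; the model reaches the same grid layout by window-partitioning and then swapping two axes, which is why the attention sequence length is always `partition_size ** 2`:

```
import torch

B, C, H, W, p = 1, 8, 14, 14, 7
x = torch.randn(B, C, H, W)

# window partitioning: contiguous p x p patches form the attention sequences
windows = x.reshape(B, C, H // p, p, W // p, p)
windows = windows.permute(0, 2, 4, 3, 5, 1).reshape(B, (H // p) * (W // p), p * p, C)
print(windows.shape)  # torch.Size([1, 4, 49, 8]) -> 4 windows of 49 local tokens

# grid partitioning: each sequence gathers p x p tokens strided across the image
grid = x.reshape(B, C, p, H // p, p, W // p)
grid = grid.permute(0, 3, 5, 2, 4, 1).reshape(B, (H // p) * (W // p), p * p, C)
print(grid.shape)     # torch.Size([1, 4, 49, 8]) -> 4 groups of 49 global tokens
```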
@@ -412,13 +412,13 @@ def __init__( expansion_ratio: float, stride: int, # conv + transformer parameters - normalization_layer: Callable[..., nn.Module], + norm_layer: Callable[..., nn.Module], activation_layer: Callable[..., nn.Module], # transformer parameters head_dim: int, mlp_ratio: int, mlp_dropout: float, - attn_dropout: float, + attention_dropout: float, p_stochastic_dropout: float, # partitioning parameters partition_size: int, @@ -436,7 +436,7 @@ def __init__( squeeze_ratio=squeeze_ratio, stride=stride, activation_layer=activation_layer, - norm_layer=normalization_layer, + norm_layer=norm_layer, p_stochastic_dropout=p_stochastic_dropout, ) # attention layers, block -> grid @@ -448,8 +448,8 @@ def __init__( grid_size=grid_size, mlp_ratio=mlp_ratio, activation_layer=activation_layer, - normalization_layer=nn.LayerNorm, - attn_dropout=attn_dropout, + norm_layer=nn.LayerNorm, + attention_dropout=attention_dropout, mlp_dropout=mlp_dropout, p_stochastic_dropout=p_stochastic_dropout, ) @@ -461,8 +461,8 @@ def __init__( grid_size=grid_size, mlp_ratio=mlp_ratio, activation_layer=activation_layer, - normalization_layer=nn.LayerNorm, - attn_dropout=attn_dropout, + norm_layer=nn.LayerNorm, + attention_dropout=attention_dropout, mlp_dropout=mlp_dropout, p_stochastic_dropout=p_stochastic_dropout, ) @@ -489,11 +489,11 @@ class MaxVitBlock(nn.Module): expansion_ratio (float): Expansion ratio in the bottleneck. squeeze_ratio (float): Squeeze ratio in the SE Layer. activation_layer (Callable[..., nn.Module]): Activation function. - normalization_layer (Callable[..., nn.Module]): Normalization function. + norm_layer (Callable[..., nn.Module]): Normalization function. head_dim (int): Dimension of the attention heads. mlp_ratio (int): Ratio of the MLP layer. mlp_dropout (float): Dropout probability for the MLP layer. - attn_dropout (float): Dropout probability for the attention layer. + attention_dropout (float): Dropout probability for the attention layer. p_stochastic_dropout (float): Probability of stochastic depth. partition_size (int): Size of the partitions. input_grid_size (Tuple[int, int]): Size of the input feature grid. @@ -509,13 +509,13 @@ def __init__( squeeze_ratio: float, expansion_ratio: float, # conv + transformer parameters - normalization_layer: Callable[..., nn.Module], + norm_layer: Callable[..., nn.Module], activation_layer: Callable[..., nn.Module], # transformer parameters head_dim: int, mlp_ratio: int, mlp_dropout: float, - attn_dropout: float, + attention_dropout: float, # partitioning parameters partition_size: int, input_grid_size: Tuple[int, int], @@ -540,12 +540,12 @@ def __init__( squeeze_ratio=squeeze_ratio, expansion_ratio=expansion_ratio, stride=stride, - normalization_layer=normalization_layer, + norm_layer=norm_layer, activation_layer=activation_layer, head_dim=head_dim, mlp_ratio=mlp_ratio, mlp_dropout=mlp_dropout, - attn_dropout=attn_dropout, + attention_dropout=attention_dropout, partition_size=partition_size, grid_size=self.grid_size, p_stochastic_dropout=p, @@ -571,19 +571,19 @@ class MaxVit(nn.Module): input_size (Tuple[int, int]): Size of the input image. input_channels (int): Number of input channels. stem_channels (int): Number of channels in the stem. - num_classes (int): Number of classes. + partition_size (int): Size of the partitions. block_channels (List[int]): Number of channels in each block. block_layers (List[int]): Number of layers in each block. stochastic_depth_prob (float): Probability of stochastic depth. 
Expands to a list of probabilities for each layer that scales linearly to the specified value. - squeeze_ratio (float): Squeeze ratio in the SE Layer. - expansion_ratio (float): Expansion ratio in the MBConv bottleneck. - normalization_layer (Callable[..., nn.Module]): Normalization function. - activation_layer (Callable[..., nn.Module]): Activation function. + squeeze_ratio (float): Squeeze ratio in the SE Layer. Default: 0.25. + expansion_ratio (float): Expansion ratio in the MBConv bottleneck. Default: 4. + norm_layer (Callable[..., nn.Module]): Normalization function. Default: None (setting to None will produce a `BatchNorm2d(eps=1e-3, momentum=0.99)`). + activation_layer (Callable[..., nn.Module]): Activation function Default: nn.GELU. head_dim (int): Dimension of the attention heads. - mlp_ratio (int): Expansion ratio of the MLP layer. - mlp_dropout (float): Dropout probability for the MLP layer. - attn_dropout (float): Dropout probability for the attention layer. - partition_size (int): Size of the partitions. + mlp_ratio (int): Expansion ratio of the MLP layer. Default: 4. + mlp_dropout (float): Dropout probability for the MLP layer. Default: 0.0. + attention_dropout (float): Dropout probability for the attention layer. Default: 0.0. + num_classes (int): Number of classes. Default: 1000. """ def __init__( @@ -592,7 +592,6 @@ def __init__( input_size: Tuple[int, int], # stem and task parameters stem_channels: int, - num_classes: int, # partitioning parameters partition_size: int, # block parameters @@ -602,9 +601,9 @@ def __init__( head_dim: int, stochastic_depth_prob: float, # conv + transformer parameters - # normalization_layer is applied only to the conv layers + # norm_layer is applied only to the conv layers # activation_layer is applied both to conv and transformer layers - normalization_layer: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, activation_layer: Callable[..., nn.Module] = nn.GELU, # conv parameters squeeze_ratio: float = 0.25, @@ -612,7 +611,9 @@ def __init__( # transformer parameters mlp_ratio: int = 4, mlp_dropout: float = 0.0, - attn_dropout: float = 0.0, + attention_dropout: float = 0.0, + # task parameters + num_classes: int = 1000, ) -> None: super().__init__() _log_api_usage_once(self) @@ -621,8 +622,8 @@ def __init__( # https://github.com/google-research/maxvit/blob/da76cf0d8a6ec668cc31b399c4126186da7da944/maxvit/models/maxvit.py#L1029-L1030 # for the exact parameters used in batchnorm - if normalization_layer is None: - normalization_layer = partial(nn.BatchNorm2d, eps=1e-3, momentum=0.99) + if norm_layer is None: + norm_layer = partial(nn.BatchNorm2d, eps=1e-3, momentum=0.99) # Make sure input size will be divisible by the partition size in all blocks # Undefined behavior if H or W are not divisible by p @@ -643,7 +644,7 @@ def __init__( stem_channels, 3, stride=2, - norm_layer=normalization_layer, + norm_layer=norm_layer, activation_layer=activation_layer, bias=False, inplace=None, @@ -675,12 +676,12 @@ def __init__( out_channels=out_channel, squeeze_ratio=squeeze_ratio, expansion_ratio=expansion_ratio, - normalization_layer=normalization_layer, + norm_layer=norm_layer, activation_layer=activation_layer, head_dim=head_dim, mlp_ratio=mlp_ratio, mlp_dropout=mlp_dropout, - attn_dropout=attn_dropout, + attention_dropout=attention_dropout, partition_size=partition_size, input_grid_size=input_size, n_layers=num_layers, From 6b00ca8d83fe7e524ff8d4ba4ce96b054b3e3c98 Mon Sep 17 00:00:00 2001 From: TeodorPoncu 
Date: Fri, 23 Sep 2022 00:03:19 +0000 Subject: [PATCH 23/23] remove input_channels parameter --- torchvision/models/maxvit.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchvision/models/maxvit.py b/torchvision/models/maxvit.py index ad209130040..7bf92876385 100644 --- a/torchvision/models/maxvit.py +++ b/torchvision/models/maxvit.py @@ -569,7 +569,6 @@ class MaxVit(nn.Module): Implements MaxVit Transformer from the `MaxViT: Multi-Axis Vision Transformer <https://arxiv.org/abs/2204.01697>`_ paper. Args: input_size (Tuple[int, int]): Size of the input image. - input_channels (int): Number of input channels. stem_channels (int): Number of channels in the stem. partition_size (int): Size of the partitions. block_channels (List[int]): Number of channels in each block.
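With the last patch applied, the public surface of the series is the `maxvit_t` builder plus its weights enum. Below is a usage sketch against the end state of the branch; the names come from the patches above, downloading the published weights requires network access, and the `pretrained=` spelling only keeps working through the legacy shim added in patch 15:

```
import torch
from torchvision.models import maxvit_t, MaxVit_T_Weights

weights = MaxVit_T_Weights.IMAGENET1K_V1
model = maxvit_t(weights=weights).eval()

# the bundled preset resizes and center-crops to 224 with bicubic interpolation
preprocess = weights.transforms()
batch = preprocess(torch.rand(3, 256, 256)).unsqueeze(0)

with torch.no_grad():
    logits = model(batch)
print(logits.shape)  # torch.Size([1, 1000])

# deprecated flag, still routed to IMAGENET1K_V1 by @handle_legacy_interface
legacy = maxvit_t(pretrained=True)
```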