@@ -1216,7 +1216,8 @@ def vit_base_patch32_224_clip_laion2b(pretrained=False, **kwargs):
     """ ViT-B/32
     Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
     """
-    model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, **kwargs)
+    model_kwargs = dict(
+        patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
     model = _create_vision_transformer('vit_base_patch32_224_clip_laion2b', pretrained=pretrained, **model_kwargs)
     return model
 
@@ -1226,7 +1227,8 @@ def vit_large_patch14_224_clip_laion2b(pretrained=False, **kwargs):
     """ ViT-Large model (ViT-L/14)
     Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
     """
-    model_kwargs = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, **kwargs)
+    model_kwargs = dict(
+        patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
     model = _create_vision_transformer('vit_large_patch14_224_clip_laion2b', pretrained=pretrained, **model_kwargs)
     return model
 
@@ -1236,7 +1238,8 @@ def vit_huge_patch14_224_clip_laion2b(pretrained=False, **kwargs):
     """ ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
    Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
     """
-    model_kwargs = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, **kwargs)
+    model_kwargs = dict(
+        patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
     model = _create_vision_transformer('vit_huge_patch14_224_clip_laion2b', pretrained=pretrained, **model_kwargs)
     return model
 
@@ -1247,6 +1250,7 @@ def vit_giant_patch14_224_clip_laion2b(pretrained=False, **kwargs):
     Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
     """
     model_kwargs = dict(
-        patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16, pre_norm=True, **kwargs)
+        patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16,
+        pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
     model = _create_vision_transformer('vit_giant_patch14_224_clip_laion2b', pretrained=pretrained, **model_kwargs)
     return model
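For reference, a minimal usage sketch (not part of the commit): the model names registered above can be instantiated through timm's standard create_model factory. This assumes a timm release that includes these CLIP LAION-2B registrations and, for pretrained=True, the corresponding weight configs; with pretrained=False only the architecture is built.

# Assumes a timm version containing the vit_*_clip_laion2b registrations shown above.
import timm
import torch

# Build the ViT-B/32 CLIP image tower architecture; num_classes=0 returns pooled
# features instead of classification logits.
model = timm.create_model('vit_base_patch32_224_clip_laion2b', pretrained=False, num_classes=0)
model.eval()

with torch.no_grad():
    x = torch.randn(1, 3, 224, 224)   # ViT-B/32 expects 224x224 RGB input
    features = model(x)               # pooled image embedding, shape [1, 768] (embed_dim=768)
print(features.shape)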