13
13
14
14
15
15
class VisualEncoder(nn.Module):
    """Image tower: a headless timm backbone, a pooling stage, and a linear projection.

    Args:
        backbone: timm model name to instantiate (weights loaded elsewhere).
        dim: feature dimension of the backbone's token outputs.
        output_dim: dimension of the projected embedding.
        backbone_type: tag used elsewhere to pick the feature-extraction path.
        pooling: 'cls', 'mean', or 'attention'.
    """

    def __init__(
        self,
        backbone,
        dim,
        output_dim,
        backbone_type,
        pooling='cls',
    ):
        super().__init__()

        # num_classes=0 strips the classifier head; pretrained=False because
        # weights are restored from a checkpoint, not from timm.
        self.encoder = timm.create_model(
            backbone,
            pretrained=False,
            num_classes=0,
        )
        self.backbone_type = backbone_type
        self.pooling = pooling

        if self.pooling == 'attention':
            # Single-head attention pooling: learned queries attend over tokens.
            self.attention_pooling = nn.MultiheadAttention(
                dim,
                1,
                batch_first=True,
                dropout=0.1,
            )
            # 197 queries — presumably 196 patch tokens + CLS for a 224px/16px
            # ViT; NOTE(review): assumed, confirm against the backbone used.
            self.queries = nn.Parameter(torch.randn(1, 197, dim))

        self.proj = nn.Linear(dim, output_dim, bias=False)
@@ -53,7 +62,11 @@ def forward_features(self, x):
53
62
features = self .forward_features_conv (x )
54
63
55
64
if self .pooling == 'attention' :
56
- return self .attention_pooling (self .queries .expand (x .shape [0 ], - 1 , - 1 ), features , features )[0 ]
65
+ return self .attention_pooling (
66
+ self .queries .expand (x .shape [0 ], - 1 , - 1 ),
67
+ features ,
68
+ features ,
69
+ )[0 ]
57
70
58
71
return features
59
72
@@ -64,14 +77,14 @@ def forward_features_vit(self, x):
64
77
x = self .encoder .patch_embed (x )
65
78
x = self .encoder ._pos_embed (x )
66
79
x = self .encoder .norm_pre (x )
67
-
80
+
68
81
for block in self .encoder .blocks :
69
82
x = block (x )
70
-
83
+
71
84
x = self .encoder .norm (x )
72
-
73
- return x
74
-
85
+
86
+ return x
87
+
75
88
def get_embedding (self , x , project = True ):
76
89
if isinstance (x , list ):
77
90
x = x [- 1 ]
@@ -80,29 +93,31 @@ def get_embedding(self, x, project=True):
80
93
x = x [:, 0 ]
81
94
elif self .pooling == 'mean' :
82
95
x = x .mean (dim = 1 )
83
-
96
+
84
97
if project :
85
98
return self .proj (x )
86
99
87
100
return x
88
101
89
102
90
103
class TextEncoder (nn .Module ):
91
- def __init__ (self ,
92
- backbone ,
93
- backbone_type ,
94
- unimodal_n_layers ,
95
- context_dim ,
96
- dim ,
97
- output_dim ,
98
- pooling = 'cls' ,
99
- head_one_neuron = False ):
104
+ def __init__ (
105
+ self ,
106
+ backbone ,
107
+ backbone_type ,
108
+ unimodal_n_layers ,
109
+ context_dim ,
110
+ dim ,
111
+ output_dim ,
112
+ pooling = 'cls' ,
113
+ head_one_neuron = False ,
114
+ ):
100
115
101
116
super ().__init__ ()
102
117
self .backbone = TextEncoderBackbone (
103
118
backbone ,
104
119
backbone_type ,
105
- unimodal_n_layers
120
+ unimodal_n_layers ,
106
121
)
107
122
108
123
if context_dim != dim :
@@ -118,26 +133,31 @@ def __init__(self,
118
133
def forward (self , x , attention_mask , causal = False ):
119
134
features = self .forward_unimodal (x , attention_mask , causal )
120
135
return features , self .get_embedding (features , attention_mask )
121
-
136
+
122
137
def forward_unimodal (self , x , attention_mask , causal = False ):
123
- prep_attention_mask = self .prepare_attention_mask (attention_mask , causal )
138
+ prep_attention_mask = self .prepare_attention_mask (
139
+ attention_mask ,
140
+ causal ,
141
+ )
124
142
x = self .backbone .embeddings (x )
125
143
126
144
for layer in self .backbone .unimodal_encoder :
127
145
x = layer (x , prep_attention_mask )[0 ]
128
-
146
+
129
147
return x
130
148
131
149
def forward_multimodal (
132
150
self ,
133
151
x ,
134
152
attention_mask ,
135
153
context ,
136
- causal = False ):
137
-
138
- prep_attention_mask = self .prepare_attention_mask (attention_mask , causal )
154
+ causal = False ,
155
+ ):
156
+ prep_attention_mask = self .prepare_attention_mask (
157
+ attention_mask ,
158
+ causal ,
159
+ )
139
160
context = self .context_proj (context )
140
-
141
161
for layer in self .backbone .multimodal_encoder :
142
162
x , _ , _ = layer (x , prep_attention_mask , context )
143
163
@@ -147,15 +167,14 @@ def get_matching_scores(
147
167
self ,
148
168
x ,
149
169
attention_mask ,
150
- context ):
151
-
170
+ context ,
171
+ ):
152
172
embeddings = self .forward_multimodal (
153
173
x ,
154
174
attention_mask ,
155
175
context ,
156
- False
176
+ False ,
157
177
)
158
-
159
178
return self ._logit_and_norm (embeddings )
160
179
161
180
def _logit_and_norm (self , embeddings ):
@@ -170,18 +189,21 @@ def get_embedding(self, x, attention_mask, project=True):
170
189
mask_expanded = attention_mask .unsqueeze (2 )
171
190
vec_sum = (x * mask_expanded ).sum (dim = 1 )
172
191
x = vec_sum / mask_expanded .sum (dim = 1 )
192
+
173
193
elif self .pooling == 'cls' :
174
194
x = x [:, 0 ]
175
195
176
196
if project :
177
197
return self .proj (x )
178
198
179
199
return x
180
-
200
+
181
201
def prepare_attention_mask (self , mask , causal = False ):
182
202
if causal :
183
- causal_mask = torch .ones (mask .size (1 ), mask .size (1 ), device = mask .device ).tril ()
184
- mask = mask [:, None , :] * causal_mask [None , :, :] # bs x seq_len x seq_len
203
+ causal_mask = torch .ones (
204
+ mask .size (1 ), mask .size (1 ), device = mask .device ).tril ()
205
+ # bs x seq_len x seq_len
206
+ mask = mask [:, None , :] * causal_mask [None , :, :]
185
207
mask = (1 - mask ) * - 10e9
186
208
return mask [:, None ]
187
209
@@ -195,6 +217,7 @@ class TextEncoderBackbone(nn.Module):
195
217
'roberta' : (RobertaConfig , RobertaModel , RobertaAttention ),
196
218
'xlm_roberta' : (XLMRobertaConfig , XLMRobertaModel , XLMRobertaAttention )
197
219
}
220
+
198
221
def __init__ (
199
222
self ,
200
223
pretrained ,
@@ -204,19 +227,23 @@ def __init__(
204
227
super ().__init__ ()
205
228
self .unimodal_n_layers = unimodal_n_layers
206
229
207
- config_file = hf_hub_download (repo_id = pretrained , filename = 'config.json' )
230
+ config_file = hf_hub_download (
231
+ repo_id = pretrained ,
232
+ filename = 'config.json' ,
233
+ )
208
234
config_cls , model_cls , attention_layer_cls = self .type2classes [backbone_type ]
209
235
config = config_cls .from_json_file (config_file )
210
236
model = model_cls (config )
211
-
237
+
212
238
self .construct_model (model , attention_layer_cls , config )
213
239
214
240
def construct_model (
215
241
self ,
216
242
backbone ,
217
243
attention_layer_cls ,
218
- config ):
219
-
244
+ config ,
245
+ ):
246
+
220
247
self .unimodal_encoder = backbone .encoder .layer [:self .unimodal_n_layers ]
221
248
self .embeddings = backbone .embeddings
222
249
self .multimodal_encoder = []
@@ -226,27 +253,33 @@ def construct_model(
226
253
FusedTransformerLayer (
227
254
config ,
228
255
attention_layer_cls ,
229
- layer )
256
+ layer ,
230
257
)
258
+ )
231
259
232
260
self .multimodal_encoder = nn .ModuleList (self .multimodal_encoder )
233
261
262
+
234
263
class FusedTransformerLayer(nn.Module):
    """Transformer layer with self-attention plus cross-attention to visual context.

    Reuses the self-attention and feed-forward sublayers of an existing HF
    layer (`base_layer`) and adds a freshly initialised cross-attention module.
    """

    def __init__(self, config, attention_layer_cls, base_layer):
        super().__init__()

        # Pretrained sublayers borrowed from the donor layer.
        self.self_attention = base_layer.attention
        self.intermediate = base_layer.intermediate
        self.output = base_layer.output
        # New cross-attention over the (projected) image features.
        self.cross_attention = attention_layer_cls(config)

    def forward(self, x, attention_mask, context):
        """Return (hidden states, self-attention probs, cross-attention probs)."""
        hidden, self_attention_probs = self.self_attention(
            x,
            attention_mask,
            output_attentions=True,
        )
        hidden, cross_attention_probs = self.cross_attention(
            hidden,
            encoder_hidden_states=context,
            output_attentions=True,
        )
        ffn_hidden = self.intermediate(hidden)
        layer_output = self.output(ffn_hidden, hidden)
        return layer_output, self_attention_probs, cross_attention_probs
@@ -257,19 +290,8 @@ def __init__(self, config):
257
290
super ().__init__ ()
258
291
self .img_encoder = VisualEncoder (** config ['img_encoder' ])
259
292
self .text_encoder = TextEncoder (** config ['text_encoder' ])
260
-
261
- self .preprocess_image = Compose ([
262
- Resize (224 , interpolation = InterpolationMode .BICUBIC ),
263
- lambda x : x .convert ('RGB' ),
264
- CenterCrop (224 ),
265
- ToTensor (),
266
- Normalize (
267
- mean = (0.48145466 , 0.4578275 , 0.40821073 ),
268
- std = (0.26862954 , 0.26130258 , 0.27577711 )
269
- )
270
- ])
271
-
272
- self ._tokenizer = AutoTokenizer .from_pretrained (config ['text_encoder' ]['backbone' ])
293
+ self ._tokenizer = AutoTokenizer .from_pretrained (
294
+ config ['text_encoder' ]['backbone' ])
273
295
274
296
def encode_image (self , x : torch .Tensor , return_features = False ):
275
297
features , embs = self .img_encoder (x )
@@ -278,7 +300,7 @@ def encode_image(self, x: torch.Tensor, return_features=False):
278
300
return features , embs
279
301
280
302
return embs
281
-
303
+
282
304
def encode_text (self , x : dict , return_features = False ):
283
305
features , embs = self .text_encoder (x ['input_ids' ], x ['attention_mask' ])
284
306
@@ -293,7 +315,8 @@ def encode_multimodal(
293
315
text : dict = None ,
294
316
image_features : torch .Tensor = None ,
295
317
text_features : torch .Tensor = None ,
296
- attention_mask : torch .Tensor = None ):
318
+ attention_mask : torch .Tensor = None ,
319
+ ):
297
320
298
321
assert image is not None or image_features is not None , "Either 'image' or 'image_features' should be non None"
299
322
assert text is not None or text_features is not None , "Either 'text_data' or 'text_features' should be non None"
@@ -318,26 +341,50 @@ def encode_multimodal(
318
341
319
342
def get_matching_scores (
320
343
self ,
321
- x : torch .Tensor ):
344
+ x : torch .Tensor ,
345
+ ):
322
346
return self .text_encoder ._logit_and_norm (x )
323
347
324
-
325
348
def preprocess_text (self , x ):
326
- x = self ._tokenizer (x ,
327
- padding = 'max_length' ,
328
- truncation = True ,
329
- return_tensors = 'pt' ,
330
- pad_to_max_length = True ,
331
- max_length = 77 )
349
+ x = self ._tokenizer (
350
+ x ,
351
+ padding = 'max_length' ,
352
+ truncation = True ,
353
+ return_tensors = 'pt' ,
354
+ pad_to_max_length = True ,
355
+ max_length = 77 ,
356
+ )
332
357
if 'token_type_ids' in x :
333
358
del x ['token_type_ids' ]
334
-
359
+
335
360
return x
336
361
362
+ def preprocess_image (self , x ):
363
+ preprocessor = Compose ([
364
+ Resize (224 , interpolation = InterpolationMode .BICUBIC ),
365
+ lambda x : x .convert ('RGB' ),
366
+ CenterCrop (224 ),
367
+ ToTensor (),
368
+ Normalize (
369
+ mean = (0.48145466 , 0.4578275 , 0.40821073 ),
370
+ std = (0.26862954 , 0.26130258 , 0.27577711 )
371
+ )
372
+ ])
373
+ if isinstance (x , list ):
374
+ images = []
375
+ for image in x :
376
+ images .append (preprocessor (image ))
377
+
378
+ batch_images = torch .stack (images , dim = 0 )
379
+ return batch_images
380
+ else :
381
+ return preprocessor (x ).unsqueeze (0 )
382
+
383
+
337
384
def get_model (model_name , token = None ):
338
385
model_path = snapshot_download (
339
386
repo_id = model_name ,
340
- token = token
387
+ token = token ,
341
388
)
342
389
config_path = f'{ model_path } /config.json'
343
390
state = torch .load (f'{ model_path } /weight.pt' )
0 commit comments