Merge branch 'master' into smaller-weight-c

asappresearch · Dec 17, 2020 · 2e67644 · 2e67644
2 parents 5688728 + 2ef3e93
commit 2e67644
Show file tree

Hide file tree

Showing 4 changed files with 54 additions and 18 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,10 @@ __pycache__
 *.egg-info/
 sru/csrc/build
 .python-version
+
+# created by torchscript tests:
+py_out.txt
+sru/csrc/Makefile
+sru/csrc/tests/main_test_cpp.cpp
+sru_ts.pt
+cpp_out.txt
diff --git a/sru/modules.py b/sru/modules.py
@@ -22,7 +22,7 @@ class SRUCell(nn.Module):
                      'dropout', 'bidirectional', 'has_skip_term', 'highway_bias',
                      'v1', 'rescale', 'activation_type', 'activation', 'custom_m',
                      'projection_size', 'num_matrices', 'layer_norm', 'weight_proj',
-                     'scale_x']
+                     'scale_x', 'normalize_after', 'weight_c_init',]
 
     scale_x: Tensor
     weight_proj: Optional[Tensor]
@@ -42,7 +42,8 @@ def __init__(self,
                  v1: bool = False,
                  custom_m: Optional[nn.Module] = None,
                  amp_recurrence_fp16: bool = False,
-                 weight_c_init: Optional[float] = None):
+                 weight_c_init: Optional[float] = None,
+                 normalize_after: bool = False):
         """Initialize the SRUCell module.
 
         Parameters
@@ -97,6 +98,8 @@ def __init__(self,
             False: torch.float32, True: torch.float16
         weight_c_init: Optional[float]
             if not None, then size of uniform initialization of weight_c
+        normalize_after: bool
+            if True, apply layer norm to the output h (post layer norm); else apply it to the input (pre layer norm)
         """
         super(SRUCell, self).__init__()
         self.input_size = input_size
@@ -117,6 +120,7 @@ def __init__(self,
             self.activation = 'tanh'
         self.amp_recurrence_fp16 = amp_recurrence_fp16
         self.weight_c_init = weight_c_init
+        self.normalize_after = normalize_after
 
         # projection dimension
         self.projection_size = 0
@@ -150,7 +154,10 @@ def __init__(self,
 
         self.layer_norm: Optional[nn.Module]= None
         if layer_norm:
-            self.layer_norm = nn.LayerNorm(self.input_size)
+            if normalize_after:
+                self.layer_norm = nn.LayerNorm(self.output_size)
+            else:
+                self.layer_norm = nn.LayerNorm(self.input_size)
 
         self.reset_parameters()
 
@@ -242,7 +249,7 @@ def forward(self,
 
         # apply layer norm before activation (i.e. before SRU computation)
         residual = input
-        if self.layer_norm is not None:
+        if self.layer_norm is not None and not self.normalize_after:
             input = self.layer_norm(input)
 
         # apply dropout for multiplication
@@ -267,6 +274,10 @@ def forward(self,
 
         # apply elementwise recurrence to get hidden states h and c
         h, c = self.apply_recurrence(U, V, residual, c0, scale_val, mask_c, mask_pad)
+
+        if self.layer_norm is not None and self.normalize_after:
+            h = self.layer_norm(h)
+
         return h, c
 
     def apply_recurrence(self,
@@ -435,7 +446,8 @@ def __init__(self,
                  custom_m: Optional[Union[nn.Module, List[nn.Module]]] = None,
                  proj_input_to_hidden_first: bool = False,
                  amp_recurrence_fp16: bool = False,
-                 weight_c_init: Optional[float] = None):
+                 weight_c_init: Optional[float] = None,
+                 normalize_after: bool = False):
         """Initialize the SRU module.
 
         Parameters
@@ -496,7 +508,8 @@ def __init__(self,
             False: torch.float32, True: torch.float16
         weight_c_init: Optional[float]
             if not None, then size of uniform initialization of weight_c
-
+        normalize_after: bool
+            if True, each layer applies layer norm to its output h (post layer norm); else to its input (pre layer norm)
         """
 
         super(SRU, self).__init__()
@@ -550,6 +563,7 @@ def __init__(self,
                 custom_m=custom_m_i,
                 amp_recurrence_fp16=amp_recurrence_fp16,
                 weight_c_init=weight_c_init,
+                normalize_after=normalize_after
             )
             rnn_lst.append(layer_i)
         self.rnn_lst = rnn_lst

diff --git a/test/test.sh b/test/test.sh
@@ -7,7 +7,6 @@
 set -e
 set -x
 
-python test/test_ts_cpp.py > py_out.txt
 cd sru/csrc/
 if [[ -d build ]]; then {
     rm -Rf build
@@ -17,5 +16,11 @@ cd build
 cmake -DCMAKE_PREFIX_PATH="$(python -c 'import torch; import os.path; print(os.path.join(os.path.dirname(torch.__file__), "share", "cmake"))')" ..
 make -j
 cd ../../../
+
+python test/test_ts_cpp.py > py_out.txt
+sru/csrc/build/example_app sru_ts.pt > cpp_out.txt
+diff cpp_out.txt py_out.txt
+
+python test/test_ts_cpp.py --normalize-after > py_out.txt
 sru/csrc/build/example_app sru_ts.pt > cpp_out.txt
 diff cpp_out.txt py_out.txt
diff --git a/test/test_ts_cpp.py b/test/test_ts_cpp.py
@@ -1,16 +1,26 @@
 import torch
 import sru
+import argparse
 
-D = 4
-model = sru.SRU(D, D, num_layers=2)
-model.eval()
 
-ts_model = torch.jit.script(model)
-ts_model.save('sru_ts.pt')
+def run(args):
+    D = 4
+    model = sru.SRU(D, D, num_layers=2, normalize_after=args.normalize_after)
+    model.eval()
 
-with torch.no_grad():
-    x = torch.ones(3, 2, D)
-    h, c = model(x)
-    h, c = h.view(-1), c.view(-1)
-    print(''.join(["{:.4f} ".format(x.item()) for x in h]))
-    print(''.join(["{:.4f} ".format(x.item()) for x in c]))
+    ts_model = torch.jit.script(model)
+    ts_model.save('sru_ts.pt')
+
+    with torch.no_grad():
+        x = torch.ones(3, 2, D)
+        h, c = model(x)
+        h, c = h.view(-1), c.view(-1)
+        print(''.join(["{:.4f} ".format(x.item()) for x in h]))
+        print(''.join(["{:.4f} ".format(x.item()) for x in c]))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--normalize-after', action='store_true')
+    args = parser.parse_args()
+    run(args)