22 commits
78829b6
Fixing some defaults if vocab not specified
petropusz Nov 30, 2020
2c71d2c
'Hierarchical variance segmentation' initial version, without torch f…
petropusz Nov 30, 2020
67890ad
option for merging until desired num of segments
petropusz Dec 1, 2020
5dc5500
part of torch segmentation layer
petropusz Dec 1, 2020
e035877
initial untested version of torch segmentation layer
petropusz Dec 1, 2020
400abc8
some fixes and testing to initial hierarchical variance clustering ve…
petropusz Dec 1, 2020
4a5afef
removed debug printout from parts that are to be used outside debuggi…
petropusz Dec 2, 2020
2bf242a
torch tensor device fixes in forward/backward + num of segments taken…
petropusz Dec 2, 2020
8f27475
using segmentation in wav2vec2_scribblelens as an option, but getting…
petropusz Dec 2, 2020
b8ded9c
added minimum number of segments per line so that masking at least 2 …
petropusz Dec 3, 2020
de752f3
SegmentDict fix
petropusz Dec 4, 2020
fdcc628
aaand removed printouts
petropusz Dec 4, 2020
4cc9f3b
initial basic segmentation image logging
petropusz Dec 15, 2020
17f91c8
better segmented image logging
petropusz Dec 15, 2020
b45de5d
segmentation logging fixes
petropusz Dec 16, 2020
e4d4a73
added option for square error cost instead of variance/mse to use as p…
petropusz Dec 17, 2020
47e30b1
option for logging representations (with input images also)
petropusz Dec 22, 2020
e895131
initial representation similarity plotting with some fixes
petropusz Dec 28, 2020
544d335
cosine distance option and preparation for different segment shorteni…
petropusz Jan 2, 2021
b6dc688
added options with segmentation with only averaging without shortenin…
petropusz Jan 4, 2021
0087112
fixing bugs from last 2 commits
petropusz Jan 4, 2021
91bf9a1
added rounding loss option to segmentation
petropusz Jan 8, 2021
13 changes: 11 additions & 2 deletions fairseq/criterions/wav2vec_criterion.py
@@ -14,11 +14,12 @@

@register_criterion("wav2vec")
class Wav2vecCriterion(FairseqCriterion):
def __init__(self, task, infonce=False, loss_weights=None, log_keys=None):
def __init__(self, task, infonce=False, loss_weights=None, log_keys=None, pass_metadata=False):
super().__init__(task)
self.infonce = infonce
self.loss_weights = None if loss_weights is None else eval(loss_weights)
self.log_keys = [] if log_keys is None else eval(log_keys)
self.pass_metadata = pass_metadata

@staticmethod
def add_args(parser):
@@ -30,6 +31,8 @@ def add_args(parser):
help='weights for additional loss terms (not first one)')
parser.add_argument('--log-keys', type=str, default=None,
help='output keys to log')
parser.add_argument('--pass-metadata', action='store_true',
help='if set, passes sample ids and the epoch number to the model (for model-specific logging of examples with particular ids each epoch, etc.)')
# fmt: on

def forward(self, model, sample, reduce=True, log_pred=False):
@@ -40,7 +43,13 @@ def forward(self, model, sample, reduce=True, log_pred=False):
2) the sample size, which is used as the denominator for the gradient
3) logging outputs to display while training
"""
net_output = model(**sample["net_input"])
if self.pass_metadata:
# epoch is now also passed in validation, but better to be careful
net_output = model(**sample["net_input"], \
id=sample["id"], \
epoch=sample["epoch"].item() if "epoch" in sample else None)
else:
net_output = model(**sample["net_input"])
logits = model.get_logits(net_output).float()
target = model.get_targets(sample, net_output)

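For context, a minimal sketch of the receiving side, assuming a model whose forward accepts the optional id/epoch keywords that the criterion now forwards. The class name, layer sizes, and logging hook here are made up for illustration; this is not the actual wav2vec2_scribblelens code.

import torch

class MetadataAwareModel(torch.nn.Module):
    """Hypothetical model that uses the ids/epoch only for side-effect logging."""

    def __init__(self, log_ids=(0, 1), dim=16):
        super().__init__()
        self.log_ids = set(log_ids)  # sample ids to visualize once per epoch
        self.linear = torch.nn.Linear(dim, dim)

    def forward(self, source, id=None, epoch=None, **kwargs):
        x = self.linear(source)
        if id is not None and epoch is not None:
            for row, sample_id in enumerate(id.tolist()):
                if sample_id in self.log_ids:
                    # e.g. dump x[row] to tensorboard, tagged with (epoch, sample_id)
                    pass
        return {"x": x}

model = MetadataAwareModel()
out = model(torch.randn(2, 16), id=torch.tensor([0, 5]), epoch=3)

The training path is unchanged; the metadata only gates logging side effects, which matches the "--pass-metadata" help text above.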
6 changes: 4 additions & 2 deletions fairseq/data/data_utils.py
@@ -469,9 +469,11 @@ def arrange(s, e, length, keep_length):

mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))

min_len = min([len(m) for m in mask_idcs])
min_len = min([len(m) for m in mask_idcs])
# [!] the unpadded part of the input sequence has to be long enough for this to work correctly
# (e.g. length 1 with min_mask=2 can cause problems)
for i, mask_idc in enumerate(mask_idcs):
if len(mask_idc) > min_len:
if len(mask_idc) > min_len: # keep the same number of masked positions in every line, as a simplification
mask_idc = np.random.choice(mask_idc, min_len, replace=False)
mask[i, mask_idc] = True

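To make the new comment concrete, here is a small standalone NumPy illustration of the subsampling step (toy indices, not taken from the repo):

import numpy as np

mask_idcs = [np.array([1, 4, 7, 9]), np.array([2, 5]), np.array([0, 3, 6])]
min_len = min(len(m) for m in mask_idcs)  # 2 in this toy batch
mask = np.zeros((len(mask_idcs), 10), dtype=bool)
for i, mask_idc in enumerate(mask_idcs):
    if len(mask_idc) > min_len:
        # subsample so every line masks exactly min_len positions
        mask_idc = np.random.choice(mask_idc, min_len, replace=False)
    mask[i, mask_idc] = True
# each row of `mask` now has exactly min_len True entries; a line whose unpadded
# length allows fewer than min_mask candidates would break this invariant,
# which is what the warning above is about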
15 changes: 11 additions & 4 deletions fairseq/data/handwriting/alphabet.py
@@ -62,9 +62,10 @@ class Alphabet:
"""
def __init__(self, filename_=None, input_dict=None,
translation_dict={'_': ' '},
unk=("@",), blank=("*",), space=(' ', '_')):
unk=("@",), blank=("*",), space=(' ', '_'),
ensure_in_dict_on_no_vocab=None): # option to make sure given chars are in the dict when no vocab is provided

if filename_:
if filename_: # both None and '' are falsy
self.chars = bidict(self.readDictionary(filename_))
print('Alphabet constructed from', filename_,
'size=', len(self.chars))
Expand All @@ -73,12 +74,18 @@ def __init__(self, filename_=None, input_dict=None,
print('Alphabet constructed from dictionary, '
'size=', len(self.chars))
else:
self.chars = bidict({
base_special_dict = {
k: i
for i, chs in enumerate([blank, space, unk])
for k in chs
})
}
if ensure_in_dict_on_no_vocab:
for c in ensure_in_dict_on_no_vocab:
if c not in base_special_dict:
base_special_dict[c] = len(base_special_dict)
self.chars = bidict(base_special_dict)
print('Alphabet constructed empty')

for c in unk:
if c not in self.chars:
print('Warning: UNK token', c, 'not in vocab')
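A standalone illustration of the new ensure_in_dict_on_no_vocab path, using the same comprehension as Alphabet.__init__ when no vocab file or dict is given. The <s>/<pad>/</s> symbols are assumed here for illustration (handwriting_dictionary.py below passes its own bos/pad/eos/unk, whose defaults are not shown in this hunk):

blank, space, unk = ("*",), (" ", "_"), ("@",)
ensure_in_dict_on_no_vocab = ("<s>", "<pad>", "</s>", "@")  # assumed specials

# same comprehension as in Alphabet.__init__ on the no-vocab branch
base_special_dict = {
    k: i
    for i, chs in enumerate([blank, space, unk])
    for k in chs
}
for c in ensure_in_dict_on_no_vocab:
    if c not in base_special_dict:
        base_special_dict[c] = len(base_special_dict)

print(base_special_dict)
# {'*': 0, ' ': 1, '_': 1, '@': 2, '<s>': 4, '<pad>': 5, '</s>': 6}
# '@' is already the unk char, so it is not duplicated; ensured chars get
# len(dict) as their index, so index 3 is skipped because ' ' and '_' share 1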
4 changes: 2 additions & 2 deletions fairseq/data/handwriting/handwriting_dictionary.py
@@ -16,11 +16,11 @@ def __init__(
): #extra_special_symbols=None,):

# [!] bos, pad, eos etc. need to be in dict file
super().__init__(alphabet_file, unk=(unk,))
super().__init__(alphabet_file, unk=(unk,), ensure_in_dict_on_no_vocab=(bos, pad, eos, unk))
#self._alphabet = Alphabet(alphabet_file, unk=(unk,))
for c, descr in zip((bos, pad, eos, unk), ("bos", "pad", "eos", "unk")):
if not self.existDict(c):
print('WARNING:', descr, 'token', c, 'not in vocab')
print('ERROR:', descr, 'token', c, 'not in the chosen vocab (vocab was loaded from file, not constructed)')
self.bos_char, self.unk_char, self.pad_char, self.eos_char = bos, unk, pad, eos
#self.symbols = []
#self.count = []