Merge pull request #114 from Living-with-machines/develop

v1.3.0
Living-with-machines · Jan 31, 2022 · 8ee2970 · 8ee2970
2 parents 6515790 + c4acf2a
commit 8ee2970
Show file tree

Hide file tree

Showing 23 changed files with 6,036 additions and 5,398 deletions.
diff --git a/DeezyMatch/DeezyMatch.py b/DeezyMatch/DeezyMatch.py
@@ -74,8 +74,7 @@ def train(input_file_path=None, dataset_path=None, model_name=None,
         preproc_steps=(dl_inputs["preprocessing"]["uni2ascii"],
                        dl_inputs["preprocessing"]["lowercase"],
                        dl_inputs["preprocessing"]["strip"],
-                       dl_inputs["preprocessing"]["only_latin_letters"],
-                       dl_inputs["preprocessing"]["prefix_suffix"],
+                       dl_inputs["preprocessing"]["only_latin_letters"]
                        ),
         max_seq_len=dl_inputs['gru_lstm']['max_seq_len'],
         mode=dl_inputs['gru_lstm']['mode'],
@@ -177,8 +176,7 @@ def finetune(input_file_path=None, dataset_path=None, model_name=None,
         preproc_steps=(dl_inputs["preprocessing"]["uni2ascii"],
                        dl_inputs["preprocessing"]["lowercase"],
                        dl_inputs["preprocessing"]["strip"],
-                       dl_inputs["preprocessing"]["only_latin_letters"],
-                       dl_inputs["preprocessing"]["prefix_suffix"],
+                       dl_inputs["preprocessing"]["only_latin_letters"]
                        ),
         max_seq_len=dl_inputs['gru_lstm']['max_seq_len'],
         mode=dl_inputs['gru_lstm']['mode'],

diff --git a/DeezyMatch/data_processing.py b/DeezyMatch/data_processing.py
@@ -4,10 +4,8 @@
 import numpy as np
 import os
 import pandas as pd
-import re
 import time
 from tqdm import tqdm
-import unicodedata
 import pickle
 from torch.utils.data import Dataset
 
@@ -102,36 +100,45 @@ def csv_split_tokenize(dataset_path, pretrained_vocab_path=None, n_train_example
     dataset_split["s2_unicode"] = dataset_split["s2"].apply(normalizeString, args=preproc_steps)
 
     cprint('[INFO]', bc.dgreen, "-- create vocabulary")
-    dataset_split["s1_unicode"] = dataset_split["s1_unicode"].apply(lambda x: string_split(x, 
+    dataset_split["s1_tokenized"] = dataset_split["s1_unicode"].apply(lambda x: string_split(x, 
                                                                                            tokenize=mode["tokenize"], 
                                                                                            min_gram=mode["min_gram"], 
-                                                                                           max_gram=mode["max_gram"]))
-    dataset_split["s2_unicode"] = dataset_split["s2_unicode"].apply(lambda x: string_split(x, 
+                                                                                           max_gram=mode["max_gram"],
+                                                                                           token_sep=mode["token_sep"],
+                                                                                           prefix_suffix=mode["prefix_suffix"]))
+    dataset_split["s2_tokenized"] = dataset_split["s2_unicode"].apply(lambda x: string_split(x, 
                                                                                            tokenize=mode["tokenize"], 
                                                                                            min_gram=mode["min_gram"], 
-                                                                                           max_gram=mode["max_gram"]))
+                                                                                           max_gram=mode["max_gram"],
+                                                                                           token_sep=mode["token_sep"],
+                                                                                           prefix_suffix=mode["prefix_suffix"]))
 
-    s1_s2_flatten = dataset_split[["s1_unicode", "s2_unicode"]].to_numpy().flatten()
+    s1_s2_flatten = dataset_split[["s1_tokenized", "s2_tokenized"]].to_numpy().flatten()
     s1_s2_flatten_all_tokens = np.unique(np.hstack(s1_s2_flatten)).tolist()
 
     cprint('[INFO]', bc.dgreen, "-- convert tokens to indices")
-    s1_unicode = dataset_split['s1_unicode'].to_list()
-    s2_unicode = dataset_split['s2_unicode'].to_list()
+    s1_tokenized = dataset_split['s1_tokenized'].to_list()
+    s2_tokenized = dataset_split['s2_tokenized'].to_list()
 
     if pretrained_vocab_path:
         with open(pretrained_vocab_path, 'rb') as handle:
             dataset_vocab = pickle.load(handle)
 
         # XXX we need to document the following lines
-        s1_indx = [[dataset_vocab.tok2index[tok] for tok in seq if tok in dataset_vocab.tok2index] for seq in s1_unicode]
-        s2_indx = [[dataset_vocab.tok2index[tok] for tok in seq if tok in dataset_vocab.tok2index] for seq in s2_unicode]
+        s1_indx = [[dataset_vocab.tok2index[tok] for tok in seq if tok in dataset_vocab.tok2index] for seq in s1_tokenized]
+        s2_indx = [[dataset_vocab.tok2index[tok] for tok in seq if tok in dataset_vocab.tok2index] for seq in s2_tokenized]
 
+
+        # For each row i, compute len(s1_indx[i]) / len(s1_tokenized[i]) (and likewise for s2)
+        # If this ratio is 1: all tokens (after tokenization) could be found in the pretrained vocabulary
+        # Else: some tokens are missing. If "1 - (that ratio) > missing_char_threshold", remove the entry
         to_be_removed = []
         for i in range(len(s1_indx)-1, -1, -1):
-            if (1 - len(s1_indx[i]) / max(1, len(s1_unicode[i]))) > missing_char_threshold or\
-                    (1 - len(s2_indx[i]) / max(1, len(s2_unicode[i]))) > missing_char_threshold or\
-                    len(s1_unicode[i]) == 0 or\
-                    len(s2_unicode[i]) == 0:
+            if (1 - len(s1_indx[i]) / max(1, len(s1_tokenized[i]))) > missing_char_threshold or\
+                    (1 - len(s2_indx[i]) / max(1, len(s2_tokenized[i]))) > missing_char_threshold or\
+                    len(s1_tokenized[i]) == 0 or\
+                    len(s2_tokenized[i]) == 0:
+                print(i, s1_indx[i], s1_tokenized[i])
                 to_be_removed.append(i)
                 del s1_indx[i]
                 del s2_indx[i]
@@ -153,8 +160,8 @@ def csv_split_tokenize(dataset_path, pretrained_vocab_path=None, n_train_example
         dataset_vocab.addTokens(s1_s2_flatten_all_tokens)
         cprint('[INFO]', bc.dgreen, f"-- Length of vocabulary: {dataset_vocab.n_tok}") 
 
-        dataset_split['s1_indx'] = [[dataset_vocab.tok2index[tok] for tok in seq] for seq in s1_unicode]
-        dataset_split['s2_indx'] = [[dataset_vocab.tok2index[tok] for tok in seq] for seq in s2_unicode]
+        dataset_split['s1_indx'] = [[dataset_vocab.tok2index[tok] for tok in seq] for seq in s1_tokenized]
+        dataset_split['s2_indx'] = [[dataset_vocab.tok2index[tok] for tok in seq] for seq in s2_tokenized]
 
     # cleanup the indices
     dataset_split.reset_index(drop=True, inplace=True)
@@ -177,6 +184,7 @@ def test_tokenize(dataset_path, train_vocab,missing_char_threshold=0.5,
                   save_test_class="./test_dc.df",
                   dataframe_input=False,
                   csv_sep="\t",
+                  one_column_inp=False,
                   verbose=True
                   ):
 
@@ -190,13 +198,19 @@ def test_tokenize(dataset_path, train_vocab,missing_char_threshold=0.5,
         df_list = ds_fio.readlines()
         for i in range(len(df_list)):
             tmp_split_row = df_list[i].split(csv_sep)
-            #if len(tmp_split_row) != 3:
+
+            # If one_column_inp is True, insert a placeholder s2 ("tmp") and label ("true") so the row has three columns
+            if one_column_inp == True:
+                tmp_split_row.insert(1, "tmp")
+                tmp_split_row.insert(2, "true")
+
             if str(tmp_split_row[2]).strip().lower() not in ["true", "false", "1", "0"]:
                 print(f"SKIP: {df_list[i]}")
                 # change the label to remove_me, 
                 # we drop the rows with no true|false in the label column
                 tmp_split_row = f"X{csv_sep}X{csv_sep}remove_me".split(csv_sep)
             df_list[i] = tmp_split_row[:3]
+
         dataset_pd = pd.DataFrame(df_list, columns=["s1", "s2", "label"])
         dataset_pd["s1"] = dataset_pd["s1"].str.strip()
         dataset_pd["s2"] = dataset_pd["s2"].str.strip()
@@ -221,20 +235,37 @@ def test_tokenize(dataset_path, train_vocab,missing_char_threshold=0.5,
     dataset_pd = dataset_pd[:cutoff*2]
     dataset_pd["s1_unicode"] = dataset_pd["s1"].apply(normalizeString, args=preproc_steps)
     dataset_pd["s2_unicode"] = dataset_pd["s2"].apply(normalizeString, args=preproc_steps)
+
+    dataset_pd["s1_tokenized"] = dataset_pd["s1_unicode"].apply(lambda x: string_split(x, 
+                                                                                     tokenize=mode["tokenize"], 
+                                                                                     min_gram=mode["min_gram"], 
+                                                                                     max_gram=mode["max_gram"],
+                                                                                     token_sep=mode["token_sep"],
+                                                                                     prefix_suffix=mode["prefix_suffix"]))
+    dataset_pd["s2_tokenized"] = dataset_pd["s2_unicode"].apply(lambda x: string_split(x, 
+                                                                                     tokenize=mode["tokenize"], 
+                                                                                     min_gram=mode["min_gram"], 
+                                                                                     max_gram=mode["max_gram"],
+                                                                                     token_sep=mode["token_sep"],
+                                                                                     prefix_suffix=mode["prefix_suffix"]))
+
 
-    s1_unicode = dataset_pd['s1_unicode'].to_list()
-    s2_unicode = dataset_pd['s2_unicode'].to_list()
+    s1_tokenized = dataset_pd['s1_tokenized'].to_list()
+    s2_tokenized = dataset_pd['s2_tokenized'].to_list()
+
     # XXX we need to explain why we have an if in the following for loop
-    s1_indx = [[train_vocab.tok2index[tok] for tok in seq if tok in train_vocab.tok2index] for seq in s1_unicode]
-    s2_indx = [[train_vocab.tok2index[tok] for tok in seq if tok in train_vocab.tok2index] for seq in s2_unicode]
-    # XXX we need to document the following two lines
-
+    s1_indx = [[train_vocab.tok2index[tok] for tok in seq if tok in train_vocab.tok2index] for seq in s1_tokenized]
+    s2_indx = [[train_vocab.tok2index[tok] for tok in seq if tok in train_vocab.tok2index] for seq in s2_tokenized]
+
+    # For each row i, compute len(s1_indx[i]) / len(s1_tokenized[i]) (and likewise for s2)
+    # If this ratio is 1: all tokens (after tokenization) could be found in train_vocab
+    # Else: some tokens are missing. If "1 - (that ratio) > missing_char_threshold", remove the entry
     to_be_removed = []
     for i in range(len(s1_indx)-1, -1, -1):
-        if (1 - len(s1_indx[i]) / max(1, len(s1_unicode[i]))) > missing_char_threshold or\
-                (1 - len(s2_indx[i]) / max(1, len(s2_unicode[i]))) > missing_char_threshold or\
-                len(s1_unicode[i]) == 0\
-                or len(s2_unicode[i]) == 0:
+        if (1 - len(s1_indx[i]) / max(1, len(s1_tokenized[i]))) > missing_char_threshold or\
+                (1 - len(s2_indx[i]) / max(1, len(s2_tokenized[i]))) > missing_char_threshold or\
+                len(s1_tokenized[i]) == 0\
+                or len(s2_tokenized[i]) == 0:
             to_be_removed.append(i)
             del s1_indx[i]
             del s2_indx[i]

diff --git a/DeezyMatch/rnn_networks.py b/DeezyMatch/rnn_networks.py
@@ -838,7 +838,10 @@ def inference(model_path, dataset_path, train_vocab_path, input_file_path,
     if inference_mode in ['test']:
         output_state_vectors = False
         path_save_test_class = False
+        one_column_inp = False
     else:
+        # For any inference_mode other than 'test', we only need one column in the input
+        one_column_inp = True
         # Set path
         scenario_path = os.path.abspath(scenario)
         if not os.path.isdir(scenario_path):
@@ -879,13 +882,13 @@ def inference(model_path, dataset_path, train_vocab_path, input_file_path,
                        dl_inputs["preprocessing"]["lowercase"],
                        dl_inputs["preprocessing"]["strip"],
                        dl_inputs["preprocessing"]["only_latin_letters"],
-                       dl_inputs["preprocessing"]["prefix_suffix"],
                        ),
         max_seq_len=dl_inputs['gru_lstm']['max_seq_len'],
         mode=dl_inputs['gru_lstm']['mode'],
         cutoff=test_cutoff, 
         save_test_class=path_save_test_class,
-        csv_sep=dl_inputs['preprocessing']["csv_sep"]
+        csv_sep=dl_inputs['preprocessing']["csv_sep"],
+        one_column_inp=one_column_inp
         )
 
     test_dl = DataLoader(dataset=test_dc, 

diff --git a/DeezyMatch/tests/test_pipeline.py b/DeezyMatch/tests/test_pipeline.py
@@ -6,14 +6,14 @@ def test_train():
     from DeezyMatch import train as dm_train
     # train a new model
     dm_train(input_file_path="./inputs/input_dfm_pytest_001.yaml",
-             dataset_path="./dataset/dataset-string-similarity_test.txt",
+             dataset_path="./dataset/dataset-string-matching_train.txt",
              model_name="test001")
 
 def test_finetune():
     from DeezyMatch import finetune as dm_finetune
     # fine-tune a pretrained model stored at pretrained_model_path and pretrained_vocab_path
     dm_finetune(input_file_path="./inputs/input_dfm_pytest_001.yaml",
-                dataset_path="./dataset/dataset-string-similarity_test.txt",
+                dataset_path="./dataset/dataset-string-matching_finetune.txt",
                 model_name="finetuned_test001",
                 pretrained_model_path="./models/test001/test001.model",
                 pretrained_vocab_path="./models/test001/test001.vocab")
@@ -23,7 +23,7 @@ def test_inference():
 
     # model inference using a model stored at pretrained_model_path and pretrained_vocab_path
     dm_inference(input_file_path="./inputs/input_dfm_pytest_001.yaml",
-                 dataset_path="./dataset/dataset-string-similarity_test.txt",
+                 dataset_path="./dataset/dataset-string-matching_test.txt",
                  pretrained_model_path="./models/finetuned_test001/finetuned_test001.model",
                  pretrained_vocab_path="./models/finetuned_test001/finetuned_test001.vocab")
 
@@ -33,7 +33,7 @@ def test_generate_query_vecs():
     # generate vectors for queries (specified in dataset_path)
     # using a model stored at pretrained_model_path and pretrained_vocab_path
     dm_inference(input_file_path="./inputs/input_dfm_pytest_001.yaml",
-                 dataset_path="./dataset/dataset-string-similarity_test.txt",
+                 dataset_path="./dataset/dataset-queries.txt",
                  pretrained_model_path="./models/finetuned_test001/finetuned_test001.model",
                  pretrained_vocab_path="./models/finetuned_test001/finetuned_test001.vocab",
                  inference_mode="vect",
@@ -45,7 +45,7 @@ def test_generate_candidate_vecs():
     # generate vectors for candidates (specified in dataset_path)
     # using a model stored at pretrained_model_path and pretrained_vocab_path
     dm_inference(input_file_path="./inputs/input_dfm_pytest_001.yaml",
-                 dataset_path="./dataset/dataset-string-similarity_test.txt",
+                 dataset_path="./dataset/dataset-candidates.txt",
                  pretrained_model_path="./models/finetuned_test001/finetuned_test001.model",
                  pretrained_vocab_path="./models/finetuned_test001/finetuned_test001.vocab",
                  inference_mode="vect",

diff --git a/DeezyMatch/tests/test_pipeline_ngram.py b/DeezyMatch/tests/test_pipeline_ngram.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+import pytest
+
+
+def test_pipeline_ngram():
+
+    from DeezyMatch import train as dm_train
+    # train a new model
+    dm_train(input_file_path="./inputs/input_dfm_pytest_002.yaml",
+             dataset_path="./dataset/dataset-string-matching_train.txt",
+             model_name="test002")
+
+    from DeezyMatch import finetune as dm_finetune
+    # fine-tune a pretrained model stored at pretrained_model_path and pretrained_vocab_path
+    dm_finetune(input_file_path="./inputs/input_dfm_pytest_002.yaml",
+                dataset_path="./dataset/dataset-string-matching_train.txt",
+                model_name="finetuned_test002",
+                pretrained_model_path="./models/test002/test002.model",
+                pretrained_vocab_path="./models/test002/test002.vocab")
+
+    from DeezyMatch import inference as dm_inference
+
+    # model inference using a model stored at pretrained_model_path and pretrained_vocab_path
+    dm_inference(input_file_path="./inputs/input_dfm_pytest_002.yaml",
+                 dataset_path="./dataset/dataset-string-matching_train.txt",
+                 pretrained_model_path="./models/finetuned_test002/finetuned_test002.model",
+                 pretrained_vocab_path="./models/finetuned_test002/finetuned_test002.vocab")
+
+
+    from DeezyMatch import inference as dm_inference
+
+    # generate vectors for queries (specified in dataset_path)
+    # using a model stored at pretrained_model_path and pretrained_vocab_path
+    dm_inference(input_file_path="./inputs/input_dfm_pytest_002.yaml",
+                 dataset_path="./dataset/dataset-string-matching_train.txt",
+                 pretrained_model_path="./models/finetuned_test002/finetuned_test002.model",
+                 pretrained_vocab_path="./models/finetuned_test002/finetuned_test002.vocab",
+                 inference_mode="vect",
+                 scenario="queries_002/test")
+
+    from DeezyMatch import inference as dm_inference
+
+    # generate vectors for candidates (specified in dataset_path)
+    # using a model stored at pretrained_model_path and pretrained_vocab_path
+    dm_inference(input_file_path="./inputs/input_dfm_pytest_002.yaml",
+                 dataset_path="./dataset/dataset-string-matching_train.txt",
+                 pretrained_model_path="./models/finetuned_test002/finetuned_test002.model",
+                 pretrained_vocab_path="./models/finetuned_test002/finetuned_test002.vocab",
+                 inference_mode="vect",
+                 scenario="candidates_002/test")
+
+
+    from DeezyMatch import combine_vecs
+
+    # combine vectors stored in queries_002/test and save them in combined_002/queries_test
+    combine_vecs(rnn_passes=['fwd', 'bwd'], 
+                 input_scenario='queries_002/test', 
+                 output_scenario='combined_002/queries_test', 
+                 print_every=10)
+
+    from DeezyMatch import combine_vecs
+
+    # combine vectors stored in candidates_002/test and save them in combined_002/candidates_test
+    combine_vecs(rnn_passes=['fwd', 'bwd'],
+                 input_scenario='candidates_002/test',
+                 output_scenario='combined_002/candidates_test',
+                 print_every=10)
+
+    from DeezyMatch import candidate_ranker
+
+    # Select candidates based on L2-norm distance (aka faiss distance):
+    # find candidates from candidate_scenario
+    # for queries specified in query_scenario
+    candidates_pd = \
+        candidate_ranker(query_scenario="./combined_002/queries_test",
+                         candidate_scenario="./combined_002/candidates_test",
+                         ranking_metric="faiss",
+                         selection_threshold=5.,
+                         num_candidates=2,
+                         search_size=10,
+                         output_path="ranker_results_002/test_candidates_deezymatch",
+                         pretrained_model_path="./models/finetuned_test002/finetuned_test002.model",
+                         pretrained_vocab_path="./models/finetuned_test002/finetuned_test002.vocab",
+                         number_test_rows=5)
+
+    for s in candidates_pd["query"].to_list():
+        assert candidates_pd.loc[candidates_pd["query"] == s]["faiss_distance"].iloc[0][s] == pytest.approx(0.0)
+