Merge pull request #114 from Living-with-machines/develop
v1.3.0
kasra-hosseini authored Jan 31, 2022
2 parents 6515790 + c4acf2a commit 8ee2970
Showing 23 changed files with 6,036 additions and 5,398 deletions.
6 changes: 2 additions & 4 deletions DeezyMatch/DeezyMatch.py
@@ -74,8 +74,7 @@ def train(input_file_path=None, dataset_path=None, model_name=None,
         preproc_steps=(dl_inputs["preprocessing"]["uni2ascii"],
                        dl_inputs["preprocessing"]["lowercase"],
                        dl_inputs["preprocessing"]["strip"],
-                       dl_inputs["preprocessing"]["only_latin_letters"],
-                       dl_inputs["preprocessing"]["prefix_suffix"],
+                       dl_inputs["preprocessing"]["only_latin_letters"]
                        ),
         max_seq_len=dl_inputs['gru_lstm']['max_seq_len'],
         mode=dl_inputs['gru_lstm']['mode'],
@@ -177,8 +176,7 @@ def finetune(input_file_path=None, dataset_path=None, model_name=None,
         preproc_steps=(dl_inputs["preprocessing"]["uni2ascii"],
                        dl_inputs["preprocessing"]["lowercase"],
                        dl_inputs["preprocessing"]["strip"],
-                       dl_inputs["preprocessing"]["only_latin_letters"],
-                       dl_inputs["preprocessing"]["prefix_suffix"],
+                       dl_inputs["preprocessing"]["only_latin_letters"]
                        ),
         max_seq_len=dl_inputs['gru_lstm']['max_seq_len'],
         mode=dl_inputs['gru_lstm']['mode'],
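
Note: the preproc_steps tuple passed to train/finetune now carries only the four preprocessing flags; prefix_suffix moves into the tokenization settings handled in data_processing.py below. A minimal, self-contained sketch of how such a flag-driven normalizer could behave; normalize_string here is a stand-in written for illustration, not DeezyMatch's actual normalizeString:

import re
import unicodedata

def normalize_string(s, uni2ascii=True, lowercase=True, strip=True, only_latin_letters=False):
    """Toy normalizer mirroring the (uni2ascii, lowercase, strip, only_latin_letters) flag tuple."""
    if uni2ascii:
        # drop accents by decomposing and removing combining marks
        s = "".join(c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c))
    if lowercase:
        s = s.lower()
    if strip:
        s = s.strip()
    if only_latin_letters:
        # illustrative policy: keep Latin letters and spaces only
        s = re.sub(r"[^a-zA-Z ]", "", s)
    return s

print(normalize_string("  Köln "))   # -> "koln"
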
87 changes: 59 additions & 28 deletions DeezyMatch/data_processing.py
@@ -4,10 +4,8 @@
 import numpy as np
 import os
 import pandas as pd
-import re
 import time
 from tqdm import tqdm
-import unicodedata
 import pickle
 from torch.utils.data import Dataset
 
@@ -102,36 +100,45 @@ def csv_split_tokenize(dataset_path, pretrained_vocab_path=None, n_train_example
     dataset_split["s2_unicode"] = dataset_split["s2"].apply(normalizeString, args=preproc_steps)
 
     cprint('[INFO]', bc.dgreen, "-- create vocabulary")
-    dataset_split["s1_unicode"] = dataset_split["s1_unicode"].apply(lambda x: string_split(x,
+    dataset_split["s1_tokenized"] = dataset_split["s1_unicode"].apply(lambda x: string_split(x,
                                         tokenize=mode["tokenize"],
                                         min_gram=mode["min_gram"],
-                                        max_gram=mode["max_gram"]))
-    dataset_split["s2_unicode"] = dataset_split["s2_unicode"].apply(lambda x: string_split(x,
+                                        max_gram=mode["max_gram"],
+                                        token_sep=mode["token_sep"],
+                                        prefix_suffix=mode["prefix_suffix"]))
+    dataset_split["s2_tokenized"] = dataset_split["s2_unicode"].apply(lambda x: string_split(x,
                                         tokenize=mode["tokenize"],
                                         min_gram=mode["min_gram"],
-                                        max_gram=mode["max_gram"]))
+                                        max_gram=mode["max_gram"],
+                                        token_sep=mode["token_sep"],
+                                        prefix_suffix=mode["prefix_suffix"]))
 
-    s1_s2_flatten = dataset_split[["s1_unicode", "s2_unicode"]].to_numpy().flatten()
+    s1_s2_flatten = dataset_split[["s1_tokenized", "s2_tokenized"]].to_numpy().flatten()
     s1_s2_flatten_all_tokens = np.unique(np.hstack(s1_s2_flatten)).tolist()
 
     cprint('[INFO]', bc.dgreen, "-- convert tokens to indices")
-    s1_unicode = dataset_split['s1_unicode'].to_list()
-    s2_unicode = dataset_split['s2_unicode'].to_list()
+    s1_tokenized = dataset_split['s1_tokenized'].to_list()
+    s2_tokenized = dataset_split['s2_tokenized'].to_list()
 
     if pretrained_vocab_path:
         with open(pretrained_vocab_path, 'rb') as handle:
             dataset_vocab = pickle.load(handle)
 
-        # XXX we need to document the following lines
-        s1_indx = [[dataset_vocab.tok2index[tok] for tok in seq if tok in dataset_vocab.tok2index] for seq in s1_unicode]
-        s2_indx = [[dataset_vocab.tok2index[tok] for tok in seq if tok in dataset_vocab.tok2index] for seq in s2_unicode]
+        s1_indx = [[dataset_vocab.tok2index[tok] for tok in seq if tok in dataset_vocab.tok2index] for seq in s1_tokenized]
+        s2_indx = [[dataset_vocab.tok2index[tok] for tok in seq if tok in dataset_vocab.tok2index] for seq in s2_tokenized]
 
+
+        # Compute len(s1_indx) / len(s1_tokenized)
+        # If this ratio is 1: all characters (after tokenization) could be found in the pretrained vocabulary
+        # Else: some characters are missing. If "1 - (that ratio) > missing_char_threshold", remove the entry
         to_be_removed = []
        for i in range(len(s1_indx)-1, -1, -1):
-            if (1 - len(s1_indx[i]) / max(1, len(s1_unicode[i]))) > missing_char_threshold or\
-                (1 - len(s2_indx[i]) / max(1, len(s2_unicode[i]))) > missing_char_threshold or\
-                len(s1_unicode[i]) == 0 or\
-                len(s2_unicode[i]) == 0:
+            if (1 - len(s1_indx[i]) / max(1, len(s1_tokenized[i]))) > missing_char_threshold or\
+                (1 - len(s2_indx[i]) / max(1, len(s2_tokenized[i]))) > missing_char_threshold or\
+                len(s1_tokenized[i]) == 0 or\
+                len(s2_tokenized[i]) == 0:
+                print(i, s1_indx[i], s1_tokenized[i])
                 to_be_removed.append(i)
                 del s1_indx[i]
                 del s2_indx[i]
@@ -153,8 +160,8 @@ def csv_split_tokenize(dataset_path, pretrained_vocab_path=None, n_train_example
         dataset_vocab.addTokens(s1_s2_flatten_all_tokens)
         cprint('[INFO]', bc.dgreen, f"-- Length of vocabulary: {dataset_vocab.n_tok}")
 
-    dataset_split['s1_indx'] = [[dataset_vocab.tok2index[tok] for tok in seq] for seq in s1_unicode]
-    dataset_split['s2_indx'] = [[dataset_vocab.tok2index[tok] for tok in seq] for seq in s2_unicode]
+    dataset_split['s1_indx'] = [[dataset_vocab.tok2index[tok] for tok in seq] for seq in s1_tokenized]
+    dataset_split['s2_indx'] = [[dataset_vocab.tok2index[tok] for tok in seq] for seq in s2_tokenized]
 
     # cleanup the indices
     dataset_split.reset_index(drop=True, inplace=True)
@@ -177,6 +184,7 @@ def test_tokenize(dataset_path, train_vocab,missing_char_threshold=0.5,
                   save_test_class="./test_dc.df",
                   dataframe_input=False,
                   csv_sep="\t",
+                  one_column_inp=False,
                   verbose=True
                   ):
 
@@ -190,13 +198,19 @@ def test_tokenize(dataset_path, train_vocab,missing_char_threshold=0.5,
         df_list = ds_fio.readlines()
         for i in range(len(df_list)):
             tmp_split_row = df_list[i].split(csv_sep)
-            #if len(tmp_split_row) != 3:
+
+            # If one_column_inp is set to True, extend the row
+            if one_column_inp == True:
+                tmp_split_row.insert(1, "tmp")
+                tmp_split_row.insert(2, "true")
+
             if str(tmp_split_row[2]).strip().lower() not in ["true", "false", "1", "0"]:
                 print(f"SKIP: {df_list[i]}")
                 # change the label to remove_me,
                 # we drop the rows with no true|false in the label column
                 tmp_split_row = f"X{csv_sep}X{csv_sep}remove_me".split(csv_sep)
             df_list[i] = tmp_split_row[:3]
+
     dataset_pd = pd.DataFrame(df_list, columns=["s1", "s2", "label"])
     dataset_pd["s1"] = dataset_pd["s1"].str.strip()
     dataset_pd["s2"] = dataset_pd["s2"].str.strip()
@@ -221,20 +235,37 @@ def test_tokenize(dataset_path, train_vocab,missing_char_threshold=0.5,
         dataset_pd = dataset_pd[:cutoff*2]
     dataset_pd["s1_unicode"] = dataset_pd["s1"].apply(normalizeString, args=preproc_steps)
     dataset_pd["s2_unicode"] = dataset_pd["s2"].apply(normalizeString, args=preproc_steps)
+
+    dataset_pd["s1_tokenized"] = dataset_pd["s1_unicode"].apply(lambda x: string_split(x,
+                                        tokenize=mode["tokenize"],
+                                        min_gram=mode["min_gram"],
+                                        max_gram=mode["max_gram"],
+                                        token_sep=mode["token_sep"],
+                                        prefix_suffix=mode["prefix_suffix"]))
+    dataset_pd["s2_tokenized"] = dataset_pd["s2_unicode"].apply(lambda x: string_split(x,
+                                        tokenize=mode["tokenize"],
+                                        min_gram=mode["min_gram"],
+                                        max_gram=mode["max_gram"],
+                                        token_sep=mode["token_sep"],
+                                        prefix_suffix=mode["prefix_suffix"]))
+
+
-    s1_unicode = dataset_pd['s1_unicode'].to_list()
-    s2_unicode = dataset_pd['s2_unicode'].to_list()
+    s1_tokenized = dataset_pd['s1_tokenized'].to_list()
+    s2_tokenized = dataset_pd['s2_tokenized'].to_list()
 
-    # XXX we need to explain why we have an if in the following for loop
-    s1_indx = [[train_vocab.tok2index[tok] for tok in seq if tok in train_vocab.tok2index] for seq in s1_unicode]
-    s2_indx = [[train_vocab.tok2index[tok] for tok in seq if tok in train_vocab.tok2index] for seq in s2_unicode]
-    # XXX we need to document the following two lines
+
+    s1_indx = [[train_vocab.tok2index[tok] for tok in seq if tok in train_vocab.tok2index] for seq in s1_tokenized]
+    s2_indx = [[train_vocab.tok2index[tok] for tok in seq if tok in train_vocab.tok2index] for seq in s2_tokenized]
+
+    # Compute len(s1_indx) / len(s1_tokenized)
+    # If this ratio is 1: all characters (after tokenization) could be found in the pretrained vocabulary
+    # Else: some characters are missing. If "1 - (that ratio) > missing_char_threshold", remove the entry
     to_be_removed = []
     for i in range(len(s1_indx)-1, -1, -1):
-        if (1 - len(s1_indx[i]) / max(1, len(s1_unicode[i]))) > missing_char_threshold or\
-            (1 - len(s2_indx[i]) / max(1, len(s2_unicode[i]))) > missing_char_threshold or\
-            len(s1_unicode[i]) == 0\
-            or len(s2_unicode[i]) == 0:
+        if (1 - len(s1_indx[i]) / max(1, len(s1_tokenized[i]))) > missing_char_threshold or\
+            (1 - len(s2_indx[i]) / max(1, len(s2_tokenized[i]))) > missing_char_threshold or\
+            len(s1_tokenized[i]) == 0\
+            or len(s2_tokenized[i]) == 0:
             to_be_removed.append(i)
             del s1_indx[i]
             del s2_indx[i]
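
Note: the comments added above document the filter applied after token-to-index conversion: for each row, the fraction of tokens not covered by the pretrained vocabulary is compared against missing_char_threshold, and rows that are too poorly covered (or empty after tokenization) are dropped. A self-contained sketch of that check, with a toy character n-gram tokenizer standing in for string_split; the n-gram and prefix_suffix behaviour shown here is an assumption for illustration, not DeezyMatch's exact implementation:

def char_ngrams(s, min_gram=2, max_gram=3, prefix_suffix=("<", ">")):
    """Toy character n-gram tokenizer; the real splitting lives in string_split."""
    s = prefix_suffix[0] + s + prefix_suffix[1]
    return [s[i:i + n] for n in range(min_gram, max_gram + 1) for i in range(len(s) - n + 1)]

# pretend pretrained vocabulary built from two training strings
tok2index = {}
for name in ["london", "londres"]:
    for tok in char_ngrams(name):
        tok2index.setdefault(tok, len(tok2index))

missing_char_threshold = 0.5
for query in ["londons", "tokyo"]:          # pretend test rows
    tokens = char_ngrams(query)
    indexed = [tok2index[t] for t in tokens if t in tok2index]
    # fraction of tokens NOT covered by the vocabulary
    missing = 1 - len(indexed) / max(1, len(tokens))
    keep = missing <= missing_char_threshold and len(tokens) > 0
    print(f"{query!r}: missing={missing:.2f} -> {'keep' if keep else 'drop'}")
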
7 changes: 5 additions & 2 deletions DeezyMatch/rnn_networks.py
@@ -838,7 +838,10 @@ def inference(model_path, dataset_path, train_vocab_path, input_file_path,
     if inference_mode in ['test']:
         output_state_vectors = False
         path_save_test_class = False
+        one_column_inp = False
     else:
+        # In this case, we only need one column in the input
+        one_column_inp = True
         # Set path
         scenario_path = os.path.abspath(scenario)
         if not os.path.isdir(scenario_path):
@@ -879,13 +882,13 @@
                        dl_inputs["preprocessing"]["lowercase"],
                        dl_inputs["preprocessing"]["strip"],
                        dl_inputs["preprocessing"]["only_latin_letters"],
-                       dl_inputs["preprocessing"]["prefix_suffix"],
                        ),
         max_seq_len=dl_inputs['gru_lstm']['max_seq_len'],
         mode=dl_inputs['gru_lstm']['mode'],
         cutoff=test_cutoff,
         save_test_class=path_save_test_class,
-        csv_sep=dl_inputs['preprocessing']["csv_sep"]
+        csv_sep=dl_inputs['preprocessing']["csv_sep"],
+        one_column_inp=one_column_inp
         )
 
     test_dl = DataLoader(dataset=test_dc,
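
Note: the new one_column_inp flag lets the vector-generation mode ('vect') read query/candidate files that hold a single string per line; test_tokenize then pads each row with a placeholder second string and a dummy "true" label so the rest of the three-column pipeline is unchanged. A stand-alone sketch of that padding step (the sample rows and csv_sep value are illustrative):

csv_sep = "\t"
one_column_inp = True   # as set above whenever inference_mode is not 'test'

raw_rows = ["london\n", "paris\n"]          # a one-column query/candidate file
parsed = []
for line in raw_rows:
    row = line.rstrip("\n").split(csv_sep)
    if one_column_inp:
        # pad to the (s1, s2, label) layout expected downstream
        row.insert(1, "tmp")
        row.insert(2, "true")
    parsed.append(row[:3])

print(parsed)   # [['london', 'tmp', 'true'], ['paris', 'tmp', 'true']]
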
10 changes: 5 additions & 5 deletions DeezyMatch/tests/test_pipeline.py
@@ -6,14 +6,14 @@ def test_train():
     from DeezyMatch import train as dm_train
     # train a new model
     dm_train(input_file_path="./inputs/input_dfm_pytest_001.yaml",
-             dataset_path="./dataset/dataset-string-similarity_test.txt",
+             dataset_path="./dataset/dataset-string-matching_train.txt",
              model_name="test001")
 
 def test_finetune():
     from DeezyMatch import finetune as dm_finetune
     # fine-tune a pretrained model stored at pretrained_model_path and pretrained_vocab_path
     dm_finetune(input_file_path="./inputs/input_dfm_pytest_001.yaml",
-                dataset_path="./dataset/dataset-string-similarity_test.txt",
+                dataset_path="./dataset/dataset-string-matching_finetune.txt",
                 model_name="finetuned_test001",
                 pretrained_model_path="./models/test001/test001.model",
                 pretrained_vocab_path="./models/test001/test001.vocab")
@@ -23,7 +23,7 @@ def test_inference():
 
     # model inference using a model stored at pretrained_model_path and pretrained_vocab_path
     dm_inference(input_file_path="./inputs/input_dfm_pytest_001.yaml",
-                 dataset_path="./dataset/dataset-string-similarity_test.txt",
+                 dataset_path="./dataset/dataset-string-matching_test.txt",
                  pretrained_model_path="./models/finetuned_test001/finetuned_test001.model",
                  pretrained_vocab_path="./models/finetuned_test001/finetuned_test001.vocab")

Expand All @@ -33,7 +33,7 @@ def test_generate_query_vecs():
# generate vectors for queries (specified in dataset_path)
# using a model stored at pretrained_model_path and pretrained_vocab_path
dm_inference(input_file_path="./inputs/input_dfm_pytest_001.yaml",
dataset_path="./dataset/dataset-string-similarity_test.txt",
dataset_path="./dataset/dataset-queries.txt",
pretrained_model_path="./models/finetuned_test001/finetuned_test001.model",
pretrained_vocab_path="./models/finetuned_test001/finetuned_test001.vocab",
inference_mode="vect",
@@ -45,7 +45,7 @@ def test_generate_candidate_vecs():
     # generate vectors for candidates (specified in dataset_path)
     # using a model stored at pretrained_model_path and pretrained_vocab_path
     dm_inference(input_file_path="./inputs/input_dfm_pytest_001.yaml",
-                 dataset_path="./dataset/dataset-string-similarity_test.txt",
+                 dataset_path="./dataset/dataset-candidates.txt",
                  pretrained_model_path="./models/finetuned_test001/finetuned_test001.model",
                  pretrained_vocab_path="./models/finetuned_test001/finetuned_test001.vocab",
                  inference_mode="vect",
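
Note: the pytest fixtures now point at separate train, fine-tune, test, query, and candidate files instead of one shared dataset-string-similarity_test.txt. Going by the parsing logic in test_tokenize, the string-matching files are assumed to be tab-separated s1, s2, label rows (label true/false or 1/0), while the query and candidate files can hold one string per line once one_column_inp is in effect; a toy way to create such fixtures (the row contents are illustrative, not the repo's actual data):

from pathlib import Path

Path("dataset").mkdir(exist_ok=True)

# three tab-separated columns: s1, s2, label (true/false or 1/0)
Path("dataset/dataset-string-matching_train.txt").write_text(
    "london\tlondres\ttrue\n"
    "london\tparis\tfalse\n"
)

# queries/candidates: one string per line is enough once one_column_inp kicks in
Path("dataset/dataset-queries.txt").write_text("londinium\n")
Path("dataset/dataset-candidates.txt").write_text("london\nparis\n")
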
89 changes: 89 additions & 0 deletions DeezyMatch/tests/test_pipeline_ngram.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import pytest


def test_pipeline_ngram():

    from DeezyMatch import train as dm_train
    # train a new model
    dm_train(input_file_path="./inputs/input_dfm_pytest_002.yaml",
             dataset_path="./dataset/dataset-string-matching_train.txt",
             model_name="test002")

    from DeezyMatch import finetune as dm_finetune
    # fine-tune a pretrained model stored at pretrained_model_path and pretrained_vocab_path
    dm_finetune(input_file_path="./inputs/input_dfm_pytest_002.yaml",
                dataset_path="./dataset/dataset-string-matching_train.txt",
                model_name="finetuned_test002",
                pretrained_model_path="./models/test002/test002.model",
                pretrained_vocab_path="./models/test002/test002.vocab")

    from DeezyMatch import inference as dm_inference

    # model inference using a model stored at pretrained_model_path and pretrained_vocab_path
    dm_inference(input_file_path="./inputs/input_dfm_pytest_002.yaml",
                 dataset_path="./dataset/dataset-string-matching_train.txt",
                 pretrained_model_path="./models/finetuned_test002/finetuned_test002.model",
                 pretrained_vocab_path="./models/finetuned_test002/finetuned_test002.vocab")


    from DeezyMatch import inference as dm_inference

    # generate vectors for queries (specified in dataset_path)
    # using a model stored at pretrained_model_path and pretrained_vocab_path
    dm_inference(input_file_path="./inputs/input_dfm_pytest_002.yaml",
                 dataset_path="./dataset/dataset-string-matching_train.txt",
                 pretrained_model_path="./models/finetuned_test002/finetuned_test002.model",
                 pretrained_vocab_path="./models/finetuned_test002/finetuned_test002.vocab",
                 inference_mode="vect",
                 scenario="queries_002/test")

    from DeezyMatch import inference as dm_inference

    # generate vectors for candidates (specified in dataset_path)
    # using a model stored at pretrained_model_path and pretrained_vocab_path
    dm_inference(input_file_path="./inputs/input_dfm_pytest_002.yaml",
                 dataset_path="./dataset/dataset-string-matching_train.txt",
                 pretrained_model_path="./models/finetuned_test002/finetuned_test002.model",
                 pretrained_vocab_path="./models/finetuned_test002/finetuned_test002.vocab",
                 inference_mode="vect",
                 scenario="candidates_002/test")


    from DeezyMatch import combine_vecs

    # combine vectors stored in queries/test and save them in combined/queries_test
    combine_vecs(rnn_passes=['fwd', 'bwd'],
                 input_scenario='queries_002/test',
                 output_scenario='combined_002/queries_test',
                 print_every=10)

    from DeezyMatch import combine_vecs

    # combine vectors stored in candidates/test and save them in combined/candidates_test
    combine_vecs(rnn_passes=['fwd', 'bwd'],
                 input_scenario='candidates_002/test',
                 output_scenario='combined_002/candidates_test',
                 print_every=10)

    from DeezyMatch import candidate_ranker

    # Select candidates based on L2-norm distance (aka faiss distance):
    # find candidates from candidate_scenario
    # for queries specified in query_scenario
    candidates_pd = \
        candidate_ranker(query_scenario="./combined_002/queries_test",
                         candidate_scenario="./combined_002/candidates_test",
                         ranking_metric="faiss",
                         selection_threshold=5.,
                         num_candidates=2,
                         search_size=10,
                         output_path="ranker_results_002/test_candidates_deezymatch",
                         pretrained_model_path="./models/finetuned_test002/finetuned_test002.model",
                         pretrained_vocab_path="./models/finetuned_test002/finetuned_test002.vocab",
                         number_test_rows=5)

    for s in candidates_pd["query"].to_list():
        assert candidates_pd.loc[candidates_pd["query"] == s]["faiss_distance"].iloc[0][s] == pytest.approx(0.0)
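
Note: because this n-gram scenario uses the same file for queries and candidates, every query should retrieve itself with an L2 (faiss) distance of about zero, which is what the closing loop asserts. A toy illustration of that access pattern on a hand-built DataFrame; the faiss_distance column is assumed (from the assert above) to map each retrieved candidate string to its distance:

import pandas as pd
import pytest

# hand-built stand-in for the ranker output
candidates_pd = pd.DataFrame({
    "query": ["london", "paris"],
    "faiss_distance": [{"london": 0.0, "londres": 3.2}, {"paris": 0.0, "parys": 2.7}],
})

for s in candidates_pd["query"].to_list():
    best = candidates_pd.loc[candidates_pd["query"] == s]["faiss_distance"].iloc[0]
    assert best[s] == pytest.approx(0.0)   # each query matches itself at ~zero distance
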
