From a8d3c8deab91cdda902249e4445cf7b421639e82 Mon Sep 17 00:00:00 2001 From: kasra-hosseini Date: Thu, 15 Sep 2022 15:19:31 +0100 Subject: [PATCH 1/5] Add calc_cosine; Rename use_predict to calc_predict --- DeezyMatch/candidateRanker.py | 86 ++++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 32 deletions(-) diff --git a/DeezyMatch/candidateRanker.py b/DeezyMatch/candidateRanker.py index 681fc30..8fdcce8 100644 --- a/DeezyMatch/candidateRanker.py +++ b/DeezyMatch/candidateRanker.py @@ -55,7 +55,8 @@ def __init__( num_candidates=10, search_size=4, length_diff=None, - use_predict=True, + calc_predict=False, + calc_cosine=False, output_path="ranker_output", pretrained_model_path=None, pretrained_vocab_path=None, @@ -72,7 +73,8 @@ def __init__( self.num_candidates = num_candidates self.search_size = search_size self.length_diff = length_diff - self.use_predict = use_predict + self.calc_predict = calc_predict + self.calc_cosine = calc_cosine self.output_path = output_path self.pretrained_model_path = pretrained_model_path self.pretrained_vocab_path = pretrained_vocab_path @@ -91,7 +93,8 @@ def rank(self): num_candidates=self.num_candidates, search_size=self.search_size, length_diff=self.length_diff, - use_predict=self.use_predict, + calc_predict=self.calc_predict, + calc_cosine=self.calc_cosine, output_path=self.output_path, pretrained_model_path=self.pretrained_model_path, pretrained_vocab_path=self.pretrained_vocab_path, @@ -108,7 +111,8 @@ def set_query( num_candidates=None, search_size=None, length_diff=None, - use_predict=True, + calc_predict=False, + calc_cosine=False, number_test_rows=None, output_path=None, ): @@ -126,8 +130,10 @@ def set_query( self.search_size = search_size if length_diff: self.length_diff = length_diff - if use_predict: - self.use_predict = use_predict + if calc_predict: + self.calc_predict = calc_predict + if calc_cosine: + self.calc_cosine = calc_cosine if number_test_rows: self.number_test_rows = number_test_rows if 
output_path: @@ -162,7 +168,8 @@ def __str__(self): msg += f"selection_threshold:\t{self.selection_threshold}\n" msg += f"search_size:\t\t{self.search_size}\n" msg += f"length_diff:\t\t{self.length_diff}\n" - msg += f"use_predict:\t\t{self.use_predict}\n" + msg += f"calc_predict:\t\t{self.calc_predict}\n" + msg += f"calc_cosine:\t\t{self.calc_cosine}\n" msg += f"number_test_rows:\t{self.number_test_rows}\n" msg += f"---I/O---\n" if self.input_file_path in ["default"]: @@ -186,7 +193,8 @@ def candidate_ranker( num_candidates=10, search_size=4, length_diff=None, - use_predict=True, + calc_predict=False, + calc_cosine=False, output_path="ranker_output", pretrained_model_path=None, pretrained_vocab_path=None, @@ -222,8 +230,10 @@ def candidate_ranker( number of candidates to be tested at each iteration length_diff max length difference allowed between query and candidate strings - use_predict - boolean on whether to use prediction in ranking or not + calc_predict + boolean on whether to calculate prediction (i.e. model inference) or not + calc_cosine + boolean on whether to calculate cosine similarity or not output_path path to the output file pretrained_model_path @@ -254,6 +264,11 @@ def candidate_ranker( # read input file dl_inputs = read_input_file(input_file_path, verbose) + if not ranking_metric.lower() in ["faiss", "cosine", "conf"]: + sys.exit( + f"[ERROR] ranking_metric of {ranking_metric.lower()} is not supported. " + "Current ranking methods are: 'faiss', 'cosine', 'conf'" + ) if (ranking_metric.lower() in ["faiss"]) and (selection_threshold < 0): sys.exit( f"[ERROR] Threshold for the selected metric: '{ranking_metric}' should be >= 0." @@ -264,16 +279,14 @@ def candidate_ranker( sys.exit( f"[ERROR] Threshold for the selected metric: '{ranking_metric}' should be between 0 and 1." 
) - if (ranking_metric.lower() in ["conf"]) and use_predict == False: - sys.exit( - f"ranking_metric: {ranking_metric} is selected, but use_predict is set to {use_predict}" - ) - - if not ranking_metric.lower() in ["faiss", "cosine", "conf"]: - sys.exit( - f"[ERROR] ranking_metric of {ranking_metric.lower()} is not supported. " - "Current ranking methods are: 'faiss', 'cosine', 'conf'" - ) + if (ranking_metric.lower() in ["conf"]) and calc_predict == False: + print(f"[WARNING] ranking_metric: {ranking_metric} is selected, but calc_predict is set to {calc_predict}") + print(f"[WARNING] calc_predict will be set to True.") + calc_predict = True + if (ranking_metric.lower() in ["cosine"]) and calc_cosine == False: + print(f"[WARNING] ranking_metric: {ranking_metric} is selected, but calc_cosine is set to {calc_cosine}") + print(f"[WARNING] calc_cosine will be set to True.") + calc_cosine = True if num_candidates == 0: sys.exit(f"[ERROR] num_candidates must be larger than 0.") @@ -404,14 +417,18 @@ def candidate_ranker( query_candidate_pd["label"] = "False" - # Compute cosine similarity - cosine_sim = cosine_similarity( - vecs_query[iq : (iq + 1)].detach().cpu().numpy(), - vecs_candidates.detach().cpu().numpy()[orig_id_candis], - ) - cosine_dist = 1.0 - cosine_sim + if calc_cosine: + # Compute cosine similarity + cosine_sim = cosine_similarity( + vecs_query[iq : (iq + 1)].detach().cpu().numpy(), + vecs_candidates.detach().cpu().numpy()[orig_id_candis], + ) + cosine_dist = 1.0 - cosine_sim + cosine_dist = cosine_dist[0] + else: + cosine_dist = [None] * len(query_candidate_pd) - if use_predict and (not pretrained_model_path in [False, None]): + if calc_predict and (not pretrained_model_path in [False, None]): all_preds = candidate_conf_calc( query_candidate_pd, model, @@ -426,7 +443,7 @@ def candidate_ranker( query_candidate_pd["faiss_dist"] = found_neighbours[0][ 0, id_0_neigh:id_1_neigh ] - query_candidate_pd["cosine_dist"] = cosine_dist[0] + 
query_candidate_pd["cosine_dist"] = cosine_dist query_candidate_pd["s1_orig_ids"] = orig_id_queries query_candidate_pd["s2_orig_ids"] = orig_id_candis @@ -527,13 +544,16 @@ def candidate_ranker( )[:num_candidates] for i_row, row in collect_neigh_pd.iterrows(): - if use_predict == True: + if calc_predict == True: mydict_dl_match[row["s2_orig"]] = round(row["dl_match"], 4) mydict_dl_1_minus_match[row["s2_orig"]] = 1.0 - round( row["dl_match"], 4 ) mydict_faiss_dist[row["s2_orig"]] = round(row["faiss_dist"], 4) - mydict_cosine_dist[row["s2_orig"]] = round(row["cosine_dist"], 4) + if calc_cosine: + mydict_cosine_dist[row["s2_orig"]] = round(row["cosine_dist"], 4) + else: + mydict_cosine_dist[row["s2_orig"]] = row["cosine_dist"] mydict_candid_id[row["s2_orig"]] = row["s2_orig_ids"] one_row = { "id": orig_id_queries, @@ -574,7 +594,8 @@ def main(): num_candidates, search_size, length_diff, - use_predict, + calc_predict, + calc_cosine, output_path, pretrained_model_path, pretrained_vocab_path, @@ -593,7 +614,8 @@ def main(): num_candidates=num_candidates, search_size=search_size, length_diff=length_diff, - use_predict=use_predict, + calc_predict=calc_predict, + calc_cosine=calc_cosine, output_path=output_path, pretrained_model_path=pretrained_model_path, pretrained_vocab_path=pretrained_vocab_path, From 46f8655f0237cf4c74c35d77956d9fe7af6f25f1 Mon Sep 17 00:00:00 2001 From: kasra-hosseini Date: Thu, 15 Sep 2022 15:23:47 +0100 Subject: [PATCH 2/5] Update README --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4da9863..daf19c8 100644 --- a/README.md +++ b/README.md @@ -855,8 +855,8 @@ As mentioned, the retrieval of candidates is based on several parameters: ``` :bangbang: In `conf` (i.e., prediction-confidence), the threshold corresponds to the **minimum** accepted value, while in `faiss` and `cosine` metrics, the threshold is the **maximum** accepted value. 
:bangbang: The `cosine` and `conf` scores are between [0, 1] while `faiss` distance can take any values from [0, +∞). -* **Use prediction** (`use_predict`): If the selected ranking metric is `faiss` or `cosine`, you can choose to skip prediction (by setting it to `False`), therefore speeding up the ranking significantly. -* **Search size** (`search_size`): Unless `use_predict` is set to `False` (and therefore the prediction step is skipped during ranking), for a given query, DeezyMatch searches for candidates iteratively. At each iteration, the selected ranking metric between a query and candidates (with the size of `search_size`) is computed, and if the number of desired candidates (specified by `num_candidates`) is not reached, a new batch of candidates with the size of `search_size` is tested in the next iteration. This continues until candidates with the size of `num_candidates` are found or all the candidates are tested. If the role of `search_size` argument is not clear, refer to [Tips / Suggestions on DeezyMatch functionalities](#tips--suggestions-on-deezymatch-functionalities). +* **Calculate prediction** (`calc_predict`): Whether to calculate the prediction (i.e., run model inference) during ranking. It defaults to `False`, so if the selected ranking metric is `faiss` or `cosine`, prediction is skipped unless you set `calc_predict` to `True`, which speeds up the ranking significantly. If the selected ranking metric is `conf`, `calc_predict` is set to `True` automatically (a warning is printed). +* **Search size** (`search_size`): Unless `calc_predict` is set to `False` (and therefore the prediction step is skipped during ranking), for a given query, DeezyMatch searches for candidates iteratively. At each iteration, the selected ranking metric between a query and candidates (with the size of `search_size`) is computed, and if the number of desired candidates (specified by `num_candidates`) is not reached, a new batch of candidates with the size of `search_size` is tested in the next iteration. This continues until candidates with the size of `num_candidates` are found or all the candidates are tested. 
If the role of `search_size` argument is not clear, refer to [Tips / Suggestions on DeezyMatch functionalities](#tips--suggestions-on-deezymatch-functionalities). * **Maximum length difference** (`length_diff`): Finally, you can also specify the maximum length difference allowed between the query and the retrieved candidate strings, which may be a useful feature for certain applications. Finally, **only for testing**, you can use `number_test_rows`. It specifies the number of queries to be used for testing. @@ -881,7 +881,8 @@ Summary of the arguments/flags: | num_candidates | -n | number of desired candidates | | search_size | -sz | number of candidates to be tested at each iteration | | length_diff | -ld | max length difference allowed between query and candidate strings | -| use_predict | -up | whether to use prediction in ranking or not | +| calc_predict | -up | whether to calculate prediction (i.e., model inference) or not | +| calc_cosine | -cc | whether to calculate cosine similarity or not | | output_path | -o | path to the output file | | pretrained_model_path | -mp | path to the pretrained model | | pretrained_vocab_path | -v | path to the pretrained vocabulary | From bcca1981c0855782920d559f13b67a28671fd7de Mon Sep 17 00:00:00 2001 From: kasra-hosseini Date: Thu, 15 Sep 2022 15:53:14 +0100 Subject: [PATCH 3/5] update tests according to the new changes --- DeezyMatch/tests/test_pipeline_one_col_input.py | 8 +++----- DeezyMatch/utils.py | 4 +++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/DeezyMatch/tests/test_pipeline_one_col_input.py b/DeezyMatch/tests/test_pipeline_one_col_input.py index a70bc93..49340c4 100644 --- a/DeezyMatch/tests/test_pipeline_one_col_input.py +++ b/DeezyMatch/tests/test_pipeline_one_col_input.py @@ -143,7 +143,6 @@ def test_pipeline_one_col_input(): selection_threshold=5.0, num_candidates=2, search_size=10, - use_predict=False, output_path="ranker_results_003/test_candidates_deezymatch", 
pretrained_model_path="./models/finetuned_test003/finetuned_test003.model", pretrained_vocab_path="./models/finetuned_test003/finetuned_test003.vocab", @@ -153,7 +152,7 @@ def test_pipeline_one_col_input(): from DeezyMatch import candidate_ranker # Select candidates based on L2-norm distance (aka faiss distance) - # where ranking_metric is conf and use_prediction is false: + # where ranking_metric is conf and calc_predict is false: candidates_pd_predfalse = candidate_ranker( query_scenario="./combined_003/queries_test", candidate_scenario="./combined_003/candidates_test", @@ -161,14 +160,13 @@ def test_pipeline_one_col_input(): selection_threshold=5.0, num_candidates=2, search_size=10, - use_predict=True, output_path="ranker_results_003/test_candidates_deezymatch", pretrained_model_path="./models/finetuned_test003/finetuned_test003.model", pretrained_vocab_path="./models/finetuned_test003/finetuned_test003.vocab", number_test_rows=5, ) - # Same candidates and faiss scores should be retrieved independently of use_predict value: + # Same candidates and faiss scores should be retrieved independently of calc_predict value: candidates_pd_predtrue.faiss_distance == candidates_pd_predfalse.faiss_distance from DeezyMatch import candidate_ranker @@ -184,7 +182,7 @@ def test_pipeline_one_col_input(): num_candidates=2, search_size=10, length_diff=2, - use_predict=True, + calc_predict=True, output_path="ranker_results_003/test_candidates_deezymatch", pretrained_model_path="./models/finetuned_test003/finetuned_test003.model", pretrained_vocab_path="./models/finetuned_test003/finetuned_test003.vocab", diff --git a/DeezyMatch/utils.py b/DeezyMatch/utils.py index df0591f..9aa3342 100644 --- a/DeezyMatch/utils.py +++ b/DeezyMatch/utils.py @@ -446,7 +446,9 @@ def read_command_candidate_ranker(): "-ld", "--length_diff", help="max length difference", default=None ) - parser.add_argument("-up", "--use_predict", help="use predict", default=True) + parser.add_argument("-up", 
"--calc_predict", help="calculate predict", default=False) + + parser.add_argument("-cc", "--calc_cosine", help="calculate cosine", default=False) parser.add_argument("-o", "--output_path", help="path to output file") From d59c6dc650d515212fdd2c5adc284ea3add3501e Mon Sep 17 00:00:00 2001 From: kasra-hosseini Date: Tue, 20 Sep 2022 15:18:05 +0100 Subject: [PATCH 4/5] update example notebook --- examples/example_001.ipynb | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/examples/example_001.ipynb b/examples/example_001.ipynb index fe51828..c46869d 100644 --- a/examples/example_001.ipynb +++ b/examples/example_001.ipynb @@ -18,7 +18,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "scrolled": false + "scrolled": true }, "outputs": [], "source": [ @@ -234,6 +234,39 @@ "candidates_pd" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from DeezyMatch import candidate_ranker\n", + "\n", + "# Select candidates based on cosine distance:\n", + "# find candidates from candidate_scenario \n", + "# for queries specified in query_scenario\n", + "candidates_pd = \\\n", + " candidate_ranker(query_scenario=\"./combined/queries_test\",\n", + " candidate_scenario=\"./combined/candidates_test\", \n", + " ranking_metric=\"cosine\", \n", + " selection_threshold=0.9, \n", + " num_candidates=2, \n", + " search_size=2, \n", + " output_path=\"ranker_results/test_candidates_deezymatch_cosine\", \n", + " pretrained_model_path=\"./models/finetuned_test001/finetuned_test001.model\", \n", + " pretrained_vocab_path=\"./models/finetuned_test001/finetuned_test001.vocab\", \n", + " number_test_rows=20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "candidates_pd" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -391,7 +424,7 @@ "name": "python", "nbconvert_exporter": 
"python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.12" } }, "nbformat": 4, From 6c5f536b5f383da032be7a34c6ebbc5a94b0992d Mon Sep 17 00:00:00 2001 From: kasra-hosseini Date: Tue, 20 Sep 2022 15:22:04 +0100 Subject: [PATCH 5/5] v1.3.4 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ae67861..eda6a41 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setuptools.setup( name="DeezyMatch", - version="1.3.3", + version="1.3.4", description="A Flexible Deep Learning Approach to Fuzzy String Matching and Candidate Ranking", author=u"The LwM Development Team", #author_email="",