Merge pull request #136 from Living-with-machines/develop

v1.3.4
Living-with-machines · Sep 20, 2022 · b3e5504 · b3e5504
2 parents b47053f + 6c5f536
commit b3e5504
Show file tree

Hide file tree

Showing 6 changed files with 100 additions and 44 deletions.
diff --git a/DeezyMatch/candidateRanker.py b/DeezyMatch/candidateRanker.py
@@ -55,7 +55,8 @@ def __init__(
         num_candidates=10,
         search_size=4,
         length_diff=None,
-        use_predict=True,
+        calc_predict=False,
+        calc_cosine=False,
         output_path="ranker_output",
         pretrained_model_path=None,
         pretrained_vocab_path=None,
@@ -72,7 +73,8 @@ def __init__(
         self.num_candidates = num_candidates
         self.search_size = search_size
         self.length_diff = length_diff
-        self.use_predict = use_predict
+        self.calc_predict = calc_predict
+        self.calc_cosine = calc_cosine
         self.output_path = output_path
         self.pretrained_model_path = pretrained_model_path
         self.pretrained_vocab_path = pretrained_vocab_path
@@ -91,7 +93,8 @@ def rank(self):
             num_candidates=self.num_candidates,
             search_size=self.search_size,
             length_diff=self.length_diff,
-            use_predict=self.use_predict,
+            calc_predict=self.calc_predict,
+            calc_cosine=self.calc_cosine,
             output_path=self.output_path,
             pretrained_model_path=self.pretrained_model_path,
             pretrained_vocab_path=self.pretrained_vocab_path,
@@ -108,7 +111,8 @@ def set_query(
         num_candidates=None,
         search_size=None,
         length_diff=None,
-        use_predict=True,
+        calc_predict=False,
+        calc_cosine=False,
         number_test_rows=None,
         output_path=None,
     ):
@@ -126,8 +130,10 @@ def set_query(
             self.search_size = search_size
         if length_diff:
             self.length_diff = length_diff
-        if use_predict:
-            self.use_predict = use_predict
+        if calc_predict:
+            self.calc_predict = calc_predict
+        if calc_cosine:
+            self.calc_cosine = calc_cosine
         if number_test_rows:
             self.number_test_rows = number_test_rows
         if output_path:
@@ -162,7 +168,8 @@ def __str__(self):
         msg += f"selection_threshold:\t{self.selection_threshold}\n"
         msg += f"search_size:\t\t{self.search_size}\n"
         msg += f"length_diff:\t\t{self.length_diff}\n"
-        msg += f"use_predict:\t\t{self.use_predict}\n"
+        msg += f"calc_predict:\t\t{self.calc_predict}\n"
+        msg += f"calc_cosine:\t\t{self.calc_cosine}\n"
         msg += f"number_test_rows:\t{self.number_test_rows}\n"
         msg += f"---I/O---\n"
         if self.input_file_path in ["default"]:
@@ -186,7 +193,8 @@ def candidate_ranker(
     num_candidates=10,
     search_size=4,
     length_diff=None,
-    use_predict=True,
+    calc_predict=False,
+    calc_cosine=False,
     output_path="ranker_output",
     pretrained_model_path=None,
     pretrained_vocab_path=None,
@@ -222,8 +230,10 @@ def candidate_ranker(
         number of candidates to be tested at each iteration
     length_diff
         max length difference allowed between query and candidate strings
-    use_predict
-        boolean on whether to use prediction in ranking or not
+    calc_predict
+        boolean on whether to calculate prediction (i.e. model inference) or not
+    calc_cosine
+        boolean on whether to calculate cosine similarity or not
     output_path
         path to the output file
     pretrained_model_path
@@ -254,6 +264,11 @@ def candidate_ranker(
     # read input file
     dl_inputs = read_input_file(input_file_path, verbose)
 
+    if not ranking_metric.lower() in ["faiss", "cosine", "conf"]:
+        sys.exit(
+            f"[ERROR] ranking_metric of {ranking_metric.lower()} is not supported. "
+            "Current ranking methods are: 'faiss', 'cosine', 'conf'"
+        )
     if (ranking_metric.lower() in ["faiss"]) and (selection_threshold < 0):
         sys.exit(
             f"[ERROR] Threshold for the selected metric: '{ranking_metric}' should be >= 0."
@@ -264,16 +279,14 @@ def candidate_ranker(
         sys.exit(
             f"[ERROR] Threshold for the selected metric: '{ranking_metric}' should be between 0 and 1."
         )
-    if (ranking_metric.lower() in ["conf"]) and use_predict == False:
-        sys.exit(
-            f"ranking_metric: {ranking_metric} is selected, but use_predict is set to {use_predict}"
-        )
-
-    if not ranking_metric.lower() in ["faiss", "cosine", "conf"]:
-        sys.exit(
-            f"[ERROR] ranking_metric of {ranking_metric.lower()} is not supported. "
-            "Current ranking methods are: 'faiss', 'cosine', 'conf'"
-        )
+    if (ranking_metric.lower() in ["conf"]) and calc_predict == False:
+        print(f"[WARNING] ranking_metric: {ranking_metric} is selected, but calc_predict is set to {calc_predict}")
+        print(f"[WARNING] calc_predict will be set to True.")
+        calc_predict = True
+    if (ranking_metric.lower() in ["cosine"]) and calc_cosine == False:
+        print(f"[WARNING] ranking_metric: {ranking_metric} is selected, but calc_cosine is set to {calc_cosine}")
+        print(f"[WARNING] calc_cosine will be set to True.")
+        calc_cosine = True
 
     if num_candidates == 0:
         sys.exit(f"[ERROR] num_candidates must be larger than 0.")
@@ -404,14 +417,18 @@ def candidate_ranker(
 
             query_candidate_pd["label"] = "False"
 
-            # Compute cosine similarity
-            cosine_sim = cosine_similarity(
-                vecs_query[iq : (iq + 1)].detach().cpu().numpy(),
-                vecs_candidates.detach().cpu().numpy()[orig_id_candis],
-            )
-            cosine_dist = 1.0 - cosine_sim
+            if calc_cosine:
+                # Compute cosine similarity
+                cosine_sim = cosine_similarity(
+                    vecs_query[iq : (iq + 1)].detach().cpu().numpy(),
+                    vecs_candidates.detach().cpu().numpy()[orig_id_candis],
+                )
+                cosine_dist = 1.0 - cosine_sim
+                cosine_dist = cosine_dist[0]
+            else:
+                cosine_dist = [None] * len(query_candidate_pd)
 
-            if use_predict and (not pretrained_model_path in [False, None]):
+            if calc_predict and (not pretrained_model_path in [False, None]):
                 all_preds = candidate_conf_calc(
                     query_candidate_pd,
                     model,
@@ -426,7 +443,7 @@ def candidate_ranker(
             query_candidate_pd["faiss_dist"] = found_neighbours[0][
                 0, id_0_neigh:id_1_neigh
             ]
-            query_candidate_pd["cosine_dist"] = cosine_dist[0]
+            query_candidate_pd["cosine_dist"] = cosine_dist
             query_candidate_pd["s1_orig_ids"] = orig_id_queries
             query_candidate_pd["s2_orig_ids"] = orig_id_candis
 
@@ -527,13 +544,16 @@ def candidate_ranker(
             )[:num_candidates]
 
         for i_row, row in collect_neigh_pd.iterrows():
-            if use_predict == True:
+            if calc_predict == True:
                 mydict_dl_match[row["s2_orig"]] = round(row["dl_match"], 4)
                 mydict_dl_1_minus_match[row["s2_orig"]] = 1.0 - round(
                     row["dl_match"], 4
                 )
             mydict_faiss_dist[row["s2_orig"]] = round(row["faiss_dist"], 4)
-            mydict_cosine_dist[row["s2_orig"]] = round(row["cosine_dist"], 4)
+            if calc_cosine:
+                mydict_cosine_dist[row["s2_orig"]] = round(row["cosine_dist"], 4)
+            else:
+                mydict_cosine_dist[row["s2_orig"]] = row["cosine_dist"]
             mydict_candid_id[row["s2_orig"]] = row["s2_orig_ids"]
         one_row = {
             "id": orig_id_queries,
@@ -574,7 +594,8 @@ def main():
         num_candidates,
         search_size,
         length_diff,
-        use_predict,
+        calc_predict,
+        calc_cosine,
         output_path,
         pretrained_model_path,
         pretrained_vocab_path,
@@ -593,7 +614,8 @@ def main():
         num_candidates=num_candidates,
         search_size=search_size,
         length_diff=length_diff,
-        use_predict=use_predict,
+        calc_predict=calc_predict,
+        calc_cosine=calc_cosine,
         output_path=output_path,
         pretrained_model_path=pretrained_model_path,
         pretrained_vocab_path=pretrained_vocab_path,

diff --git a/DeezyMatch/tests/test_pipeline_one_col_input.py b/DeezyMatch/tests/test_pipeline_one_col_input.py
@@ -143,7 +143,6 @@ def test_pipeline_one_col_input():
         selection_threshold=5.0,
         num_candidates=2,
         search_size=10,
-        use_predict=False,
         output_path="ranker_results_003/test_candidates_deezymatch",
         pretrained_model_path="./models/finetuned_test003/finetuned_test003.model",
         pretrained_vocab_path="./models/finetuned_test003/finetuned_test003.vocab",
@@ -153,22 +152,21 @@ def test_pipeline_one_col_input():
     from DeezyMatch import candidate_ranker
 
     # Select candidates based on L2-norm distance (aka faiss distance)
-    # where ranking_metric is conf and use_prediction is false:
+    # where ranking_metric is faiss and calc_predict is left at its default (False):
     candidates_pd_predfalse = candidate_ranker(
         query_scenario="./combined_003/queries_test",
         candidate_scenario="./combined_003/candidates_test",
         ranking_metric="faiss",
         selection_threshold=5.0,
         num_candidates=2,
         search_size=10,
-        use_predict=True,
         output_path="ranker_results_003/test_candidates_deezymatch",
         pretrained_model_path="./models/finetuned_test003/finetuned_test003.model",
         pretrained_vocab_path="./models/finetuned_test003/finetuned_test003.vocab",
         number_test_rows=5,
     )
 
-    # Same candidates and faiss scores should be retrieved independently of use_predict value:
+    # Same candidates and faiss scores should be retrieved independently of calc_predict value:
     candidates_pd_predtrue.faiss_distance == candidates_pd_predfalse.faiss_distance
 
     from DeezyMatch import candidate_ranker
@@ -184,7 +182,7 @@ def test_pipeline_one_col_input():
         num_candidates=2,
         search_size=10,
         length_diff=2,
-        use_predict=True,
+        calc_predict=True,
         output_path="ranker_results_003/test_candidates_deezymatch",
         pretrained_model_path="./models/finetuned_test003/finetuned_test003.model",
         pretrained_vocab_path="./models/finetuned_test003/finetuned_test003.vocab",

diff --git a/DeezyMatch/utils.py b/DeezyMatch/utils.py
@@ -446,7 +446,9 @@ def read_command_candidate_ranker():
         "-ld", "--length_diff", help="max length difference", default=None
     )
 
-    parser.add_argument("-up", "--use_predict", help="use predict", default=True)
+    parser.add_argument("-up", "--calc_predict", help="calculate predict", default=False)
+
+    parser.add_argument("-cc", "--calc_cosine", help="calculate cosine", default=False)
 
     parser.add_argument("-o", "--output_path", help="path to output file")
 

diff --git a/README.md b/README.md
@@ -855,8 +855,8 @@ As mentioned, the retrieval of candidates is based on several parameters:
   ```
   :bangbang: In `conf` (i.e., prediction-confidence), the threshold corresponds to the **minimum** accepted value, while in `faiss` and `cosine` metrics, the threshold is the **maximum** accepted value.
   :bangbang: The `cosine` and `conf` scores are between [0, 1] while `faiss` distance can take any values from [0, +&#8734;).
-* **Use prediction** (`use_predict`): If the selected ranking metric is `faiss` or `cosine`, you can choose to skip prediction (by setting it to `False`), therefore speeding up the ranking significantly.
-* **Search size** (`search_size`): Unless `use_predict` is set to `False` (and therefore the prediction step is skipped during ranking), for a given query, DeezyMatch searches for candidates iteratively. At each iteration, the selected ranking metric between a query and candidates (with the size of `search_size`) is computed, and if the number of desired candidates (specified by `num_candidates`) is not reached, a new batch of candidates with the size of `search_size` is tested in the next iteration. This continues until candidates with the size of `num_candidates` are found or all the candidates are tested. If the role of `search_size` argument is not clear, refer to [Tips / Suggestions on DeezyMatch functionalities](#tips--suggestions-on-deezymatch-functionalities).
+* **Calculate prediction** (`calc_predict`): If the selected ranking metric is `faiss` or `cosine`, you can choose to skip prediction (by setting it to `False`), therefore speeding up the ranking significantly.
+* **Search size** (`search_size`): Unless `calc_predict` is set to `False` (and therefore the prediction step is skipped during ranking), for a given query, DeezyMatch searches for candidates iteratively. At each iteration, the selected ranking metric between a query and candidates (with the size of `search_size`) is computed, and if the number of desired candidates (specified by `num_candidates`) is not reached, a new batch of candidates with the size of `search_size` is tested in the next iteration. This continues until candidates with the size of `num_candidates` are found or all the candidates are tested. If the role of `search_size` argument is not clear, refer to [Tips / Suggestions on DeezyMatch functionalities](#tips--suggestions-on-deezymatch-functionalities).
 * **Maximum length difference** (`length_diff`): Finally, you can also specify the maximum length difference allowed between the query and the retrieved candidate strings, which may be a useful feature for certain applications.
 
 Finally, **only for testing**, you can use `number_test_rows`. It specifies the number of queries to be used for testing.
@@ -881,7 +881,8 @@ Summary of the arguments/flags:
 | num_candidates        	| -n                	| number of desired candidates                                                                                                                                                	|
 | search_size           	| -sz               	| number of candidates to be tested at each iteration                                                                                                                         	|
 | length_diff           	| -ld               	| max length difference allowed between query and candidate strings                                                                                                                         	|
-| use_predict           	| -up               	| whether to use prediction in ranking or not                                                                                                                         	|
+| calc_predict           	| -up               	| whether to calculate prediction (i.e., model inference) or not                                                                                                                         	|
+| calc_cosine           	| -cc               	| whether to calculate cosine similarity or not                                                                                                                         	|
 | output_path           	| -o                	| path to the output file                                                                                                                                                     	|
 | pretrained_model_path 	| -mp               	| path to the pretrained model                                                                                                                                                	|
 | pretrained_vocab_path 	| -v                	| path to the pretrained vocabulary                                                                                                                                           	|

diff --git a/examples/example_001.ipynb b/examples/example_001.ipynb
@@ -18,7 +18,7 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "scrolled": false
+    "scrolled": true
    },
    "outputs": [],
    "source": [
@@ -234,6 +234,39 @@
     "candidates_pd"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from DeezyMatch import candidate_ranker\n",
+    "\n",
+    "# Select candidates based on cosine distance:\n",
+    "# find candidates from candidate_scenario \n",
+    "# for queries specified in query_scenario\n",
+    "candidates_pd = \\\n",
+    "    candidate_ranker(query_scenario=\"./combined/queries_test\",\n",
+    "                     candidate_scenario=\"./combined/candidates_test\", \n",
+    "                     ranking_metric=\"cosine\", \n",
+    "                     selection_threshold=0.9, \n",
+    "                     num_candidates=2, \n",
+    "                     search_size=2, \n",
+    "                     output_path=\"ranker_results/test_candidates_deezymatch_cosine\", \n",
+    "                     pretrained_model_path=\"./models/finetuned_test001/finetuned_test001.model\", \n",
+    "                     pretrained_vocab_path=\"./models/finetuned_test001/finetuned_test001.vocab\", \n",
+    "                     number_test_rows=20)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "candidates_pd"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -391,7 +424,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.7"
+   "version": "3.9.12"
   }
  },
  "nbformat": 4,

diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setuptools.setup(
     name="DeezyMatch",
-    version="1.3.3",
+    version="1.3.4",
     description="A Flexible Deep Learning Approach to Fuzzy String Matching and Candidate Ranking",
     author=u"The LwM Development Team",
     #author_email="",