Retrieve MinHash from LSHForest (#234)

123epsilon · Arham Khan · Arham Khan · web-flow · commit f0ae48b3db01 · 2024-03-10T21:25:43.000-07:00
* add get minhash from lshforest

* format

* fix format string

* return hashvalues instead of MinHash

* preallocate hashvalue buffer

* add direct hashvalue check to test

---------

Co-authored-by: Arham Khan <arham23@polaris-login-04.hsn.cm.polaris.alcf.anl.gov>
Co-authored-by: Arham Khan <arham23@polaris-login-01.hsn.cm.polaris.alcf.anl.gov>
Co-authored-by: Arham Khan <arham23@polaris-login-03.hsn.cm.polaris.alcf.anl.gov>
diff --git a/datasketch/lshforest.py b/datasketch/lshforest.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
 from typing import Hashable, List
+import numpy as np
 
 from datasketch.minhash import MinHash
 
@@ -128,6 +129,30 @@ def query(self, minhash: MinHash, k: int) -> List[Hashable]:
             r -= 1
         return list(results)
 
+    def get_minhash_hashvalues(self, key: Hashable) -> np.ndarray:
+        """
+        Return the hash values stored in the forest under the given key.
+
+        The array can seed ``MinHash(hashvalues=...)`` to re-check Jaccard similarity.
+
+        Args:
+            key (Hashable): The key whose stored hash values are wanted.
+
+        Returns:
+            np.ndarray: The stored segments, byte-swapped back and joined in order.
+
+        Raises:
+            KeyError: If no segments are stored under ``key``.
+        """
+        segments = self.keys.get(key)
+        if segments is None:
+            raise KeyError(f"The provided key does not exist in the LSHForest: {key}")
+        n_vals = len(segments[0]) // 8  # uint64 values per byte string, sized from the first
+        out = np.empty(len(segments) * n_vals, dtype=np.uint64)
+        for idx, chunk in enumerate(segments):
+            out[idx * n_vals:(idx + 1) * n_vals] = np.frombuffer(chunk, dtype=np.uint64).byteswap()
+        return out
+
     def _binary_search(self, n, func):
         """
         https://golang.org/src/sort/search.go?s=2247:2287#L49
diff --git a/test/test_lshforest.py b/test/test_lshforest.py
@@ -62,6 +62,18 @@ def test_query(self):
             results = forest.query(data[key], 10)
             self.assertIn(key, results)
 
+    def test_get_minhash_hashvalues(self):
+        # Hash values fetched by key must rebuild a MinHash matching the indexed one.
+        forest, data = self._setup()
+        for key in data:
+            original = data[key]
+            stored = forest.get_minhash_hashvalues(key)
+            rebuilt = MinHash(hashvalues=stored)
+            self.assertEqual(len(stored), len(rebuilt.hashvalues))
+            self.assertEqual(rebuilt.jaccard(original), 1.0)
+            for expected, actual in zip(stored, rebuilt.hashvalues):
+                self.assertEqual(expected, actual)
+
     def test_pickle(self):
         forest, _ = self._setup()
         forest2 = pickle.loads(pickle.dumps(forest))