|
1 | 1 | from collections import defaultdict
|
2 | 2 | from typing import Hashable, List
|
| 3 | +import numpy as np |
3 | 4 |
|
4 | 5 | from datasketch.minhash import MinHash
|
5 | 6 |
|
@@ -128,6 +129,30 @@ def query(self, minhash: MinHash, k: int) -> List[Hashable]:
|
128 | 129 | r -= 1
|
129 | 130 | return list(results)
|
130 | 131 |
|
def get_minhash_hashvalues(self, key: Hashable) -> np.ndarray:
    """
    Rebuild the flat array of MinHash hashvalues stored in the LSHForest for ``key``.

    Handy for reconstructing the original MinHash object, e.g. to manually check the
    Jaccard similarity of the top-k results returned by a query.

    Args:
        key (Hashable): The key whose MinHash hashvalues should be retrieved.

    Returns:
        hashvalues: A 1-D ``np.uint64`` array holding the hashvalues of every stored
            segment, concatenated in storage order.

    Raises:
        KeyError: If ``key`` is not present in the LSHForest.
    """
    stored_segments = self.keys.get(key, None)
    if stored_segments is None:
        raise KeyError(f"The provided key does not exist in the LSHForest: {key}")
    segment_count = len(stored_segments)
    # Each hashvalue occupies 8 bytes (uint64); every segment is taken to hold
    # as many values as the first one.
    values_per_segment = len(stored_segments[0]) // 8
    hashvalues = np.empty(segment_count * values_per_segment, dtype=np.uint64)
    # Two-dimensional view onto the same memory: one row per stored segment.
    rows = hashvalues.reshape(segment_count, values_per_segment)
    for row, raw in zip(rows, stored_segments):
        # The byte representation is flipped during storage, so swap it back
        # after decoding.
        row[:] = np.frombuffer(raw, dtype=np.uint64).byteswap()
    return hashvalues
| 155 | + |
131 | 156 | def _binary_search(self, n, func):
|
132 | 157 | """
|
133 | 158 | https://golang.org/src/sort/search.go?s=2247:2287#L49
|
|
0 commit comments