Merge pull request #276 from ashvardanian/main-dev

ashvardanian authored Sep 25, 2023
2 parents 2b7e32b + e1b132e commit ff65b13
Showing 6 changed files with 93 additions and 27 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -145,6 +145,7 @@
         "SLOC",
         "sorensen",
         "tanimoto",
+        "tqdm",
         "uninitialize",
         "unumusearch",
         "usearch",
2 changes: 1 addition & 1 deletion include/usearch/index_plugins.hpp
@@ -1602,7 +1602,7 @@ class flat_hash_multi_set_gt {
         auto slot_pointer = bucket_pointer + sizeof(bucket_header_t) + sizeof(element_t) * in_bucket_index;
         return {
             *reinterpret_cast<bucket_header_t*>(bucket_pointer),
-            std::uint64_t(1ull) << in_bucket_index,
+            static_cast<std::uint64_t>(1ull) << in_bucket_index,
             *reinterpret_cast<element_t*>(slot_pointer),
         };
     }
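The changed line builds what looks like a single-bit occupancy mask for one slot of a bucket with up to 64 slots. A hedged Python sketch of just that mask arithmetic (the bucket layout here is hypothetical and simplified):

def slot_mask(in_bucket_index: int) -> int:
    # Mirrors `static_cast<std::uint64_t>(1ull) << in_bucket_index`: one set bit
    # selecting a slot; the explicit cast keeps the shift in 64-bit arithmetic.
    assert 0 <= in_bucket_index < 64
    return 1 << in_bucket_index

# Example: mark slot 5 of a bucket as populated, then test for it.
populated = 0
populated |= slot_mask(5)
assert populated & slot_mask(5) != 0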
54 changes: 37 additions & 17 deletions python/lib.cpp
@@ -532,49 +532,69 @@ static void search_typed_brute_force( //
         threads = std::thread::hardware_concurrency();
 
     std::size_t tasks_count = static_cast<std::size_t>(dataset_count * queries_count);
+    executor_default_t executor{threads};
+
     // Progress status
     progress_t progress_{progress};
     std::atomic<std::size_t> processed{0};
 
-    // Allocate temporary memory to store the distance matrix
-    // Previous version didn't need temporary memory, but the performance was much lower
     struct dense_key_and_distance_t {
         u32_t offset;
         f32_t distance;
     };
-    std::vector<dense_key_and_distance_t> keys_and_distances(tasks_count);
-
-    executor_default_t{threads}.dynamic(tasks_count, [&](std::size_t thread_idx, std::size_t task_idx) {
-        std::size_t dataset_idx = task_idx / queries_count;
-        std::size_t query_idx = task_idx % queries_count;
+    auto less = [](dense_key_and_distance_t const& a, dense_key_and_distance_t const& b) {
+        return a.distance < b.distance;
+    };
+
+    // Allocate temporary memory to store the distance matrix
+    // Previous version didn't need temporary memory, but the performance was much lower.
+    // In the new design we keep two buffers - original and transposed, as in-place transpositions
+    // of non-rectangular matrixes is expensive.
+    std::vector<dense_key_and_distance_t> keys_and_distances(tasks_count * 2);
+    dense_key_and_distance_t* keys_and_distances_per_dataset = keys_and_distances.data();
+    dense_key_and_distance_t* keys_and_distances_per_query = keys_and_distances_per_dataset + tasks_count;
+    executor.dynamic(dataset_count, [&](std::size_t thread_idx, std::size_t dataset_idx) {
         byte_t const* dataset = dataset_data + dataset_idx * dataset_info.strides[0];
+        for (std::size_t query_idx = 0; query_idx != queries_count; ++query_idx) {
             byte_t const* query = queries_data + query_idx * queries_info.strides[0];
             distance_t distance = metric(dataset, query);
-
-        keys_and_distances[task_idx] = {static_cast<u32_t>(query_idx), static_cast<f32_t>(distance)};
+            std::size_t task_idx = dataset_idx * queries_count + query_idx;
+            keys_and_distances_per_dataset[task_idx].offset = static_cast<u32_t>(dataset_idx);
+            keys_and_distances_per_dataset[task_idx].distance = static_cast<f32_t>(distance);
+        }
 
         // We don't want to check for signals from multiple threads
-        ++processed;
+        processed += queries_count;
         if (thread_idx == 0)
             if (PyErr_CheckSignals() != 0 || !progress_(processed.load(), tasks_count))
                 return false;
         return true;
     });
     if (processed.load() != tasks_count)
         return;
 
+    // Transpose in a single thread to avoid contention writing into the same memory buffers
+    for (std::size_t query_idx = 0; query_idx != queries_count; ++query_idx) {
+        for (std::size_t dataset_idx = 0; dataset_idx != dataset_count; ++dataset_idx) {
+            std::size_t from_idx = queries_count * dataset_idx + query_idx;
+            std::size_t to_idx = dataset_count * query_idx + dataset_idx;
+            keys_and_distances_per_query[to_idx] = keys_and_distances_per_dataset[from_idx];
+        }
+    }
+
     // Partial-sort every query result
-    executor_default_t{threads}.fixed(queries_count, [&](std::size_t, std::size_t query_idx) {
-        auto start = keys_and_distances.data() + query_idx * dataset_count;
-        std::partial_sort(start, start + wanted, start + dataset_count,
-                          [](dense_key_and_distance_t const& a, dense_key_and_distance_t const& b) {
-                              return a.distance < b.distance;
-                          });
-
+    executor.fixed(queries_count, [&](std::size_t, std::size_t query_idx) {
         dense_key_t* keys = &keys_py2d(query_idx, 0);
         distance_t* distances = &distances_py2d(query_idx, 0);
+        auto start = keys_and_distances_per_query + dataset_count * query_idx;
+        if (wanted > 1) {
+            std::partial_sort(start, start + wanted, start + dataset_count, less);
             for (std::size_t i = 0; i != wanted; ++i)
                 keys[i] = static_cast<dense_key_t>(start[i].offset), distances[i] = start[i].distance;
+        } else {
+            auto max_it = std::max_element(start, start + dataset_count, less);
+            keys[0] = static_cast<dense_key_t>(max_it->offset), distances[0] = max_it->distance;
+        }
     });
 
     // At the end report the latest numbers, because the reporter thread may be finished earlier
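In plain terms, the reworked exact-search kernel fills the distance matrix one dataset row per task, transposes it once on a single thread, and only then selects the best `wanted` entries per query. A rough NumPy sketch of that flow, assuming squared-L2 distances; the names `dataset`, `queries`, and `wanted` are illustrative, not the actual bindings:

import numpy as np

def exact_search_sketch(dataset: np.ndarray, queries: np.ndarray, wanted: int):
    # Per-dataset layout: each worker in the C++ version owns one dataset row.
    per_dataset = np.empty((dataset.shape[0], queries.shape[0]), dtype=np.float32)
    for dataset_idx, point in enumerate(dataset):
        per_dataset[dataset_idx] = ((queries - point) ** 2).sum(axis=1)

    # One single-threaded transpose into a per-query layout, avoiding contended writes.
    per_query = per_dataset.T.copy()

    # Partial selection per query, the analog of std::partial_sort / std::max_element.
    top = np.argpartition(per_query, wanted - 1, axis=1)[:, :wanted]
    order = np.take_along_axis(per_query, top, axis=1).argsort(axis=1)
    keys = np.take_along_axis(top, order, axis=1)
    distances = np.take_along_axis(per_query, keys, axis=1)
    return keys, distances

keys, distances = exact_search_sketch(np.random.rand(1000, 64), np.random.rand(10, 64), wanted=5)

Compared with the previous one-task-per-pair scheduling, every worker now writes to its own contiguous stripe of the buffer, and the transposed layout is produced only once.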
44 changes: 44 additions & 0 deletions python/scripts/bench_exact.py
@@ -0,0 +1,44 @@
+from time import time
+
+from usearch.index import search, MetricKind, ScalarKind
+from usearch.compiled import hardware_acceleration
+from faiss import IndexFlatL2, knn
+import numpy as np
+import fire
+
+
+def run(
+    n: int = 10**5,
+    q: int = 10,
+    k: int = 100,
+    ndim: int = 256,
+    half: bool = False,
+):
+    usearch_dtype = ScalarKind.F16 if half else ScalarKind.F32
+    acceleration = hardware_acceleration(
+        dtype=usearch_dtype,
+        ndim=ndim,
+        metric_kind=MetricKind.L2sq,
+    )
+    print("Hardware acceleration in USearch: ", acceleration)
+
+    x = np.random.random((n, ndim))
+    x = x.astype(np.float16) if half else x.astype(np.float32)
+
+    start = time()
+    _ = search(
+        x,
+        x[:q],
+        k,
+        MetricKind.L2sq,
+        exact=True,
+    ).keys
+    print("USearch: ", time() - start)
+
+    start = time()
+    _ = knn(x, x[:q], k)[1]
+    print("FAISS: ", time() - start)
+
+
+if __name__ == "__main__":
+    fire.Fire(run)
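The script exposes its parameters through Fire, so it can also be driven from Python for a quick smoke test; the import path and the sizes below are illustrative assumptions, not part of the commit:

# Assumes python/scripts is on the import path.
from bench_exact import run

run(n=10_000, q=5, k=10, ndim=128, half=False)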
9 changes: 5 additions & 4 deletions python/scripts/test_tooling.py
@@ -11,7 +11,7 @@
 
 
 dimensions = [3, 97, 256]
-batch_sizes = [1, 77]
+batch_sizes = [1, 77, 100]
 
 
 @pytest.mark.parametrize("rows", batch_sizes)
@@ -56,15 +56,16 @@ def test_exact_search(rows: int, cols: int):
     :param int cols: The number of columns in the matrix.
     """
     original = np.random.rand(rows, cols)
+    keys = np.arange(rows)
     matches: BatchMatches = search(original, original, min(10, rows), exact=True)
     top_matches = (
         [int(m.keys[0]) for m in matches] if rows > 1 else int(matches.keys[0])
     )
-    assert np.all(top_matches == np.arange(rows))
+    assert np.all(top_matches == keys)
 
-    matches: Matches = search(original, original[0], min(10, rows), exact=True)
+    matches: Matches = search(original, original[-1], min(10, rows), exact=True)
     top_match = int(matches.keys[0])
-    assert top_match == 0
+    assert top_match == keys[-1]
 
 
 def test_matches_creation_and_methods():
10 changes: 5 additions & 5 deletions python/usearch/index.py
@@ -180,30 +180,30 @@ def _search_in_compiled(
         vectors = vectors.reshape(1, len(vectors))
     count_vectors = vectors.shape[0]
 
-    def distil_batch(
+    def distill_batch(
         batch_matches: BatchMatches,
     ) -> Union[BatchMatches, Matches]:
         return batch_matches[0] if count_vectors == 1 else batch_matches
 
     # Create progress bar if needed
    if log:
         name = log if isinstance(log, str) else "Search"
-        pbar = tqdm(
+        progress_bar = tqdm(
             desc=name,
             total=count_vectors,
             unit="vector",
         )
 
         def update_progress_bar(processed: int, total: int) -> bool:
-            pbar.update(processed - pbar.n)
+            progress_bar.update(processed - progress_bar.n)
             return progress if progress else True
 
         tuple_ = compiled_callable(vectors, progress=update_progress_bar, **kwargs)
-        pbar.close()
+        progress_bar.close()
     else:
         tuple_ = compiled_callable(vectors, **kwargs)
 
-    return distil_batch(BatchMatches(*tuple_))
+    return distill_batch(BatchMatches(*tuple_))
 
 
 def _add_to_compiled(
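The renamed progress_bar only appears when the caller opts in. A small hedged usage example, assuming the top-level search helper forwards log to _search_in_compiled as shown above; the data shapes are arbitrary:

import numpy as np

from usearch.index import search, MetricKind

vectors = np.random.rand(10_000, 256).astype(np.float32)
queries = vectors[:100]

# A truthy `log` (True or a custom label) turns on the tqdm-backed progress bar;
# the wrapped callback keeps returning True, so the search runs to completion.
matches = search(vectors, queries, 10, MetricKind.L2sq, exact=True, log="Exact search")
print(matches[0].keys)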
