Merge pull request #276 from ashvardanian/main-dev

ashvardanian authored Sep 25, 2023
2 parents 2b7e32b + e1b132e commit ff65b13
Showing 6 changed files with 93 additions and 27 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -145,6 +145,7 @@
         "SLOC",
         "sorensen",
         "tanimoto",
+        "tqdm",
         "uninitialize",
         "unumusearch",
         "usearch",
2 changes: 1 addition & 1 deletion include/usearch/index_plugins.hpp
@@ -1602,7 +1602,7 @@ class flat_hash_multi_set_gt {
         auto slot_pointer = bucket_pointer + sizeof(bucket_header_t) + sizeof(element_t) * in_bucket_index;
         return {
             *reinterpret_cast<bucket_header_t*>(bucket_pointer),
-            std::uint64_t(1ull) << in_bucket_index,
+            static_cast<std::uint64_t>(1ull) << in_bucket_index,
             *reinterpret_cast<element_t*>(slot_pointer),
         };
     }
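The changed line builds what looks like a single-bit occupancy mask for one slot of a bucket with up to 64 slots. A hedged Python sketch of just that mask arithmetic (the bucket layout here is hypothetical and simplified):

def slot_mask(in_bucket_index: int) -> int:
    # Mirrors `static_cast<std::uint64_t>(1ull) << in_bucket_index`: one set bit
    # selecting a slot; the explicit cast keeps the shift in 64-bit arithmetic.
    assert 0 <= in_bucket_index < 64
    return 1 << in_bucket_index

# Example: mark slot 5 of a bucket as populated, then test for it.
populated = 0
populated |= slot_mask(5)
assert populated & slot_mask(5) != 0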
54 changes: 37 additions & 17 deletions python/lib.cpp
@@ -532,49 +532,69 @@ static void search_typed_brute_force( //
         threads = std::thread::hardware_concurrency();
 
     std::size_t tasks_count = static_cast<std::size_t>(dataset_count * queries_count);
+    executor_default_t executor{threads};
+
     // Progress status
     progress_t progress_{progress};
     std::atomic<std::size_t> processed{0};
 
-    // Allocate temporary memory to store the distance matrix
-    // Previous version didn't need temporary memory, but the performance was much lower
     struct dense_key_and_distance_t {
         u32_t offset;
         f32_t distance;
     };
-    std::vector<dense_key_and_distance_t> keys_and_distances(tasks_count);
-
-    executor_default_t{threads}.dynamic(tasks_count, [&](std::size_t thread_idx, std::size_t task_idx) {
-        std::size_t dataset_idx = task_idx / queries_count;
-        std::size_t query_idx = task_idx % queries_count;
+    auto less = [](dense_key_and_distance_t const& a, dense_key_and_distance_t const& b) {
+        return a.distance < b.distance;
+    };
+
+    // Allocate temporary memory to store the distance matrix
+    // Previous version didn't need temporary memory, but the performance was much lower.
+    // In the new design we keep two buffers - original and transposed, as in-place transpositions
+    // of non-rectangular matrixes is expensive.
+    std::vector<dense_key_and_distance_t> keys_and_distances(tasks_count * 2);
+    dense_key_and_distance_t* keys_and_distances_per_dataset = keys_and_distances.data();
+    dense_key_and_distance_t* keys_and_distances_per_query = keys_and_distances_per_dataset + tasks_count;
+    executor.dynamic(dataset_count, [&](std::size_t thread_idx, std::size_t dataset_idx) {
         byte_t const* dataset = dataset_data + dataset_idx * dataset_info.strides[0];
+        for (std::size_t query_idx = 0; query_idx != queries_count; ++query_idx) {
             byte_t const* query = queries_data + query_idx * queries_info.strides[0];
             distance_t distance = metric(dataset, query);
-
-        keys_and_distances[task_idx] = {static_cast<u32_t>(query_idx), static_cast<f32_t>(distance)};
+            std::size_t task_idx = dataset_idx * queries_count + query_idx;
+            keys_and_distances_per_dataset[task_idx].offset = static_cast<u32_t>(dataset_idx);
+            keys_and_distances_per_dataset[task_idx].distance = static_cast<f32_t>(distance);
+        }
 
         // We don't want to check for signals from multiple threads
-        ++processed;
+        processed += queries_count;
         if (thread_idx == 0)
             if (PyErr_CheckSignals() != 0 || !progress_(processed.load(), tasks_count))
                 return false;
         return true;
     });
     if (processed.load() != tasks_count)
         return;
 
+    // Transpose in a single thread to avoid contention writing into the same memory buffers
+    for (std::size_t query_idx = 0; query_idx != queries_count; ++query_idx) {
+        for (std::size_t dataset_idx = 0; dataset_idx != dataset_count; ++dataset_idx) {
+            std::size_t from_idx = queries_count * dataset_idx + query_idx;
+            std::size_t to_idx = dataset_count * query_idx + dataset_idx;
+            keys_and_distances_per_query[to_idx] = keys_and_distances_per_dataset[from_idx];
+        }
+    }
+
     // Partial-sort every query result
-    executor_default_t{threads}.fixed(queries_count, [&](std::size_t, std::size_t query_idx) {
-        auto start = keys_and_distances.data() + query_idx * dataset_count;
-        std::partial_sort(start, start + wanted, start + dataset_count,
-                          [](dense_key_and_distance_t const& a, dense_key_and_distance_t const& b) {
-                              return a.distance < b.distance;
-                          });
-
+    executor.fixed(queries_count, [&](std::size_t, std::size_t query_idx) {
         dense_key_t* keys = &keys_py2d(query_idx, 0);
         distance_t* distances = &distances_py2d(query_idx, 0);
+        auto start = keys_and_distances_per_query + dataset_count * query_idx;
+        if (wanted > 1) {
+            std::partial_sort(start, start + wanted, start + dataset_count, less);
             for (std::size_t i = 0; i != wanted; ++i)
                 keys[i] = static_cast<dense_key_t>(start[i].offset), distances[i] = start[i].distance;
+        } else {
+            auto max_it = std::max_element(start, start + dataset_count, less);
+            keys[0] = static_cast<dense_key_t>(max_it->offset), distances[0] = max_it->distance;
+        }
     });
 
     // At the end report the latest numbers, because the reporter thread may be finished earlier
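In plain terms, the reworked exact-search kernel fills the distance matrix one dataset row per task, transposes it once on a single thread, and only then selects the best `wanted` entries per query. A rough NumPy sketch of that flow, assuming squared-L2 distances; the names `dataset`, `queries`, and `wanted` are illustrative, not the actual bindings:

import numpy as np

def exact_search_sketch(dataset: np.ndarray, queries: np.ndarray, wanted: int):
    # Per-dataset layout: each worker in the C++ version owns one dataset row.
    per_dataset = np.empty((dataset.shape[0], queries.shape[0]), dtype=np.float32)
    for dataset_idx, point in enumerate(dataset):
        per_dataset[dataset_idx] = ((queries - point) ** 2).sum(axis=1)

    # One single-threaded transpose into a per-query layout, avoiding contended writes.
    per_query = per_dataset.T.copy()

    # Partial selection per query, the analog of std::partial_sort / std::max_element.
    top = np.argpartition(per_query, wanted - 1, axis=1)[:, :wanted]
    order = np.take_along_axis(per_query, top, axis=1).argsort(axis=1)
    keys = np.take_along_axis(top, order, axis=1)
    distances = np.take_along_axis(per_query, keys, axis=1)
    return keys, distances

keys, distances = exact_search_sketch(np.random.rand(1000, 64), np.random.rand(10, 64), wanted=5)

Compared with the previous one-task-per-pair scheduling, every worker now writes to its own contiguous stripe of the buffer, and the transposed layout is produced only once.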
44 changes: 44 additions & 0 deletions python/scripts/bench_exact.py
@@ -0,0 +1,44 @@
+from time import time
+
+from usearch.index import search, MetricKind, ScalarKind
+from usearch.compiled import hardware_acceleration
+from faiss import IndexFlatL2, knn
+import numpy as np
+import fire
+
+
+def run(
+    n: int = 10**5,
+    q: int = 10,
+    k: int = 100,
+    ndim: int = 256,
+    half: bool = False,
+):
+    usearch_dtype = ScalarKind.F16 if half else ScalarKind.F32
+    acceleration = hardware_acceleration(
+        dtype=usearch_dtype,
+        ndim=ndim,
+        metric_kind=MetricKind.L2sq,
+    )
+    print("Hardware acceleration in USearch: ", acceleration)
+
+    x = np.random.random((n, ndim))
+    x = x.astype(np.float16) if half else x.astype(np.float32)
+
+    start = time()
+    _ = search(
+        x,
+        x[:q],
+        k,
+        MetricKind.L2sq,
+        exact=True,
+    ).keys
+    print("USearch: ", time() - start)
+
+    start = time()
+    _ = knn(x, x[:q], k)[1]
+    print("FAISS: ", time() - start)
+
+
+if __name__ == "__main__":
+    fire.Fire(run)
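The script exposes its parameters through Fire, so it can also be driven from Python for a quick smoke test; the import path and the sizes below are illustrative assumptions, not part of the commit:

# Assumes python/scripts is on the import path.
from bench_exact import run

run(n=10_000, q=5, k=10, ndim=128, half=False)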
9 changes: 5 additions & 4 deletions python/scripts/test_tooling.py
@@ -11,7 +11,7 @@
 
 
 dimensions = [3, 97, 256]
-batch_sizes = [1, 77]
+batch_sizes = [1, 77, 100]
 
 
 @pytest.mark.parametrize("rows", batch_sizes)
@@ -56,15 +56,16 @@ def test_exact_search(rows: int, cols: int):
     :param int cols: The number of columns in the matrix.
     """
     original = np.random.rand(rows, cols)
+    keys = np.arange(rows)
     matches: BatchMatches = search(original, original, min(10, rows), exact=True)
     top_matches = (
         [int(m.keys[0]) for m in matches] if rows > 1 else int(matches.keys[0])
     )
-    assert np.all(top_matches == np.arange(rows))
+    assert np.all(top_matches == keys)
 
-    matches: Matches = search(original, original[0], min(10, rows), exact=True)
+    matches: Matches = search(original, original[-1], min(10, rows), exact=True)
     top_match = int(matches.keys[0])
-    assert top_match == 0
+    assert top_match == keys[-1]
 
 
 def test_matches_creation_and_methods():
10 changes: 5 additions & 5 deletions python/usearch/index.py
@@ -180,30 +180,30 @@ def _search_in_compiled(
         vectors = vectors.reshape(1, len(vectors))
     count_vectors = vectors.shape[0]
 
-    def distil_batch(
+    def distill_batch(
         batch_matches: BatchMatches,
     ) -> Union[BatchMatches, Matches]:
         return batch_matches[0] if count_vectors == 1 else batch_matches
 
     # Create progress bar if needed
    if log:
         name = log if isinstance(log, str) else "Search"
-        pbar = tqdm(
+        progress_bar = tqdm(
             desc=name,
             total=count_vectors,
             unit="vector",
         )
 
         def update_progress_bar(processed: int, total: int) -> bool:
-            pbar.update(processed - pbar.n)
+            progress_bar.update(processed - progress_bar.n)
             return progress if progress else True
 
         tuple_ = compiled_callable(vectors, progress=update_progress_bar, **kwargs)
-        pbar.close()
+        progress_bar.close()
     else:
         tuple_ = compiled_callable(vectors, **kwargs)
 
-    return distil_batch(BatchMatches(*tuple_))
+    return distill_batch(BatchMatches(*tuple_))
 
 
 def _add_to_compiled(
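The renamed progress_bar only appears when the caller opts in. A small hedged usage example, assuming the top-level search helper forwards log to _search_in_compiled as shown above; the data shapes are arbitrary:

import numpy as np

from usearch.index import search, MetricKind

vectors = np.random.rand(10_000, 256).astype(np.float32)
queries = vectors[:100]

# A truthy `log` (True or a custom label) turns on the tqdm-backed progress bar;
# the wrapped callback keeps returning True, so the search runs to completion.
matches = search(vectors, queries, 10, MetricKind.L2sq, exact=True, log="Exact search")
print(matches[0].keys)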
