Skip to content

Commit f9fc617

Browse files
authored
Merge pull request #513 from unum-cloud/main-dev-cluster
Clustering
2 parents 0dac789 + 91c0bcb commit f9fc617

File tree

12 files changed

+754
-38
lines changed

12 files changed

+754
-38
lines changed

.vscode/settings.json

+1
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@
157157
"ivdep",
158158
"jaccard",
159159
"Jemalloc",
160+
"kmeans",
160161
"Kullback",
161162
"Leibler",
162163
"libjemalloc",

BENCHMARKS.md

+12-1
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ Within this repository you will find two commonly used utilities:
5858
- `cpp/bench.cpp` that produces the `bench_cpp` binary for broad USearch benchmarks.
5959
- `python/bench.py` and `python/bench.ipynb` for interactive charts against FAISS.
6060

61+
### C++ Benchmarking Utilities
62+
6163
To achieve the best results, we suggest compiling locally for the target architecture.
6264

6365
```sh
@@ -147,11 +149,20 @@ build_profile/bench_cpp \
147149
--cos
148150
```
149151

150-
151152
> Optional parameters include `connectivity`, `expansion_add`, `expansion_search`.
152153
153154
For Python, just open the Jupyter Notebook and start playing around.
154155

156+
### Python Benchmarking Utilities
157+
158+
Several benchmarking suites are available for Python: approximate search, exact search, and clustering.
159+
160+
```sh
161+
python/scripts/bench.py --help
162+
python/scripts/bench_exact.py --help
163+
python/scripts/bench_cluster.py --help
164+
```
165+
155166
## Datasets
156167

157168
BigANN benchmark is a good starting point, if you are searching for large collections of high-dimensional vectors.

CONTRIBUTING.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ cibuildwheel --platform macos # works only on MacOS
200200
cibuildwheel --platform windows # works only on Windows
201201
```
202202

203-
You may need root previligies for multi-architecture builds:
203+
You may need root privileges for multi-architecture builds:
204204

205205
```sh
206206
sudo $(which cibuildwheel) --platform linux

cpp/test.cpp

+16
Original file line numberDiff line numberDiff line change
@@ -1100,6 +1100,22 @@ int main(int, char**) {
11001100
test_uint40();
11011101
test_cosine<float, std::int64_t, uint40_t>(10, 10);
11021102

1103+
// Test plugins, like K-Means clustering.
1104+
{
1105+
std::size_t vectors_count = 1000, centroids_count = 10, dimensions = 256;
1106+
kmeans_clustering_t clustering;
1107+
clustering.max_iterations = 2;
1108+
std::vector<float> vectors(vectors_count * dimensions), centroids(centroids_count * dimensions);
1109+
matrix_slice_gt<float const> vectors_slice(vectors.data(), dimensions, vectors_count);
1110+
matrix_slice_gt<float> centroids_slice(centroids.data(), dimensions, centroids_count);
1111+
std::generate(vectors.begin(), vectors.end(), [] { return float(std::rand()) / float(INT_MAX); });
1112+
std::vector<std::size_t> assignments(vectors_count);
1113+
std::vector<distance_punned_t> distances(vectors_count);
1114+
auto clustering_result = clustering(vectors_slice, centroids_slice, {assignments.data(), assignments.size()},
1115+
{distances.data(), distances.size()});
1116+
expect(clustering_result);
1117+
}
1118+
11031119
// Exact search without constructing indexes.
11041120
// Great for validating the distance functions.
11051121
std::printf("Testing exact search\n");

include/usearch/index.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -370,7 +370,7 @@ template <typename scalar_at, typename allocator_at = std::allocator<scalar_at>>
370370
void reset() noexcept {
371371
if (!std::is_trivially_destructible<scalar_at>::value)
372372
for (std::size_t i = 0; i != size_; ++i)
373-
destroy_at(data_ + i);
373+
unum::usearch::destroy_at(data_ + i); //< Facing some symbol visibility/ambiguity issues
374374
allocator_at{}.deallocate(data_, size_);
375375
data_ = nullptr;
376376
size_ = 0;

include/usearch/index_dense.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -2254,4 +2254,4 @@ static join_result_t join( //
22542254
}
22552255

22562256
} // namespace usearch
2257-
} // namespace unum
2257+
} // namespace unum

include/usearch/index_plugins.hpp

+342-6
Large diffs are not rendered by default.

python/lib.cpp

+116-23
Original file line numberDiff line numberDiff line change
@@ -452,7 +452,7 @@ static py::tuple search_many_in_index( //
452452
}
453453

454454
/**
455-
* @brief Brute-force exact search implementation, compatible with
455+
* @brief Brute-force @b exact search implementation, compatible with
456456
* NumPy-like Tensors and other objects supporting Buffer Protocol.
457457
*/
458458
static py::tuple search_many_brute_force( //
@@ -545,6 +545,81 @@ static py::tuple search_many_brute_force( //
545545
return results;
546546
}
547547

548+
/**
549+
* @brief Brute-force @b K-Means clustering, compatible with
550+
* NumPy-like Tensors and other objects supporting Buffer Protocol.
551+
*/
552+
static py::tuple cluster_many_brute_force( //
553+
py::buffer dataset, //
554+
std::size_t wanted, //
555+
std::size_t max_iterations, //
556+
double inertia_threshold, //
557+
double max_seconds, //
558+
double min_shifts, //
559+
std::uint64_t seed, //
560+
std::size_t threads, //
561+
scalar_kind_t scalar_kind, //
562+
metric_kind_t metric_kind, //
563+
progress_func_t const& progress_func) {
564+
565+
using distance_t = typename kmeans_clustering_t::distance_t;
566+
py::buffer_info dataset_info = dataset.request();
567+
if (dataset_info.ndim != 2)
568+
throw std::invalid_argument("Expects a matrix (rank-2 tensor) of dataset to cluster!");
569+
570+
std::size_t dataset_count = static_cast<std::size_t>(dataset_info.shape[0]);
571+
std::size_t dataset_dimensions = static_cast<std::size_t>(dataset_info.shape[1]);
572+
std::size_t dataset_stride = static_cast<std::size_t>(dataset_info.strides[0]);
573+
scalar_kind_t dataset_kind = numpy_string_to_kind(dataset_info.format);
574+
std::size_t bytes_per_scalar = bits_per_scalar_word(dataset_kind) / CHAR_BIT;
575+
576+
std::vector<std::size_t> point_to_centroid_index(dataset_count, 0);
577+
std::vector<distance_t> point_to_centroid_distance(dataset_count, 0);
578+
std::vector<byte_t> centroids(wanted * dataset_dimensions * bytes_per_scalar, 0);
579+
580+
if (!threads)
581+
threads = std::thread::hardware_concurrency();
582+
583+
// Dispatch brute-force clustering
584+
progress_t progress{progress_func};
585+
executor_default_t executor{threads};
586+
kmeans_clustering_t engine;
587+
engine.metric_kind = metric_kind;
588+
engine.quantization_kind = scalar_kind;
589+
engine.max_iterations = max_iterations;
590+
engine.min_shifts = min_shifts;
591+
engine.max_seconds = max_seconds;
592+
engine.inertia_threshold = inertia_threshold;
593+
594+
kmeans_clustering_result_t result = engine( //
595+
reinterpret_cast<byte_t const*>(dataset_info.ptr), dataset_count, dataset_stride, //
596+
centroids.data(), wanted, dataset_dimensions * bytes_per_scalar, //
597+
point_to_centroid_index.data(), point_to_centroid_distance.data(), dataset_kind, dataset_dimensions, executor,
598+
[&](std::size_t passed, std::size_t total) { return PyErr_CheckSignals() == 0 && progress(passed, total); });
599+
600+
if (!result)
601+
throw std::runtime_error(result.error.release());
602+
603+
// Following constructor doesn't seem to be documented, but it's used in the source code of `pybind11`
604+
// https://github.com/pybind/pybind11/blob/aeda49ed0b4e6e8abba7abc265ace86a6c26ba66/include/pybind11/numpy.h#L918-L919
605+
// https://github.com/pybind/pybind11/blob/aeda49ed0b4e6e8abba7abc265ace86a6c26ba66/include/pybind11/buffer_info.h#L60-L75
606+
py::buffer_info centroids_info;
607+
centroids_info.ptr = reinterpret_cast<void*>(centroids.data());
608+
centroids_info.itemsize = dataset_info.itemsize;
609+
centroids_info.size = wanted * dataset_dimensions;
610+
centroids_info.format = dataset_info.format;
611+
centroids_info.ndim = 2;
612+
centroids_info.shape = {wanted, dataset_dimensions};
613+
centroids_info.strides = {dataset_dimensions * bytes_per_scalar, bytes_per_scalar};
614+
615+
py::tuple results(3);
616+
results[0] = py::array_t<std::size_t>({dataset_count}, point_to_centroid_index.data());
617+
results[1] = py::array_t<distance_t>({dataset_count}, point_to_centroid_distance.data());
618+
results[2] = py::array(centroids_info);
619+
620+
return results;
621+
}
622+
548623
template <typename scalar_at> struct rows_lookup_gt {
549624
byte_t* data_;
550625
std::size_t stride_;
@@ -936,16 +1011,33 @@ PYBIND11_MODULE(compiled, m) {
9361011
return index_metadata(meta);
9371012
});
9381013

939-
m.def("exact_search", &search_many_brute_force, //
940-
py::arg("dataset"), //
941-
py::arg("queries"), //
942-
py::arg("count") = 10, //
943-
py::kw_only(), //
944-
py::arg("threads") = 0, //
945-
py::arg("metric_kind") = metric_kind_t::cos_k, //
946-
py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
947-
py::arg("metric_pointer") = 0, //
948-
py::arg("progress") = nullptr //
1014+
m.def( //
1015+
"exact_search", &search_many_brute_force, //
1016+
py::arg("dataset"), //
1017+
py::arg("queries"), //
1018+
py::arg("count") = 10, //
1019+
py::kw_only(), //
1020+
py::arg("threads") = 0, //
1021+
py::arg("metric_kind") = metric_kind_t::cos_k, //
1022+
py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
1023+
py::arg("metric_pointer") = 0, //
1024+
py::arg("progress") = nullptr //
1025+
);
1026+
1027+
m.def( //
1028+
"kmeans", &cluster_many_brute_force, //
1029+
py::arg("dataset"), //
1030+
py::arg("count") = 10, //
1031+
py::kw_only(), //
1032+
py::arg("max_iterations") = kmeans_clustering_t::max_iterations_default_k, //
1033+
py::arg("inertia_threshold") = kmeans_clustering_t::inertia_threshold_default_k, //
1034+
py::arg("max_seconds") = kmeans_clustering_t::max_seconds_default_k, //
1035+
py::arg("min_shifts") = kmeans_clustering_t::min_shifts_default_k, //
1036+
py::arg("seed") = 0, //
1037+
py::arg("threads") = 0, //
1038+
py::arg("dtype") = scalar_kind_t::bf16_k, //
1039+
py::arg("metric_kind") = metric_kind_t::l2sq_k, //
1040+
py::arg("progress") = nullptr //
9491041
);
9501042

9511043
m.def(
@@ -961,18 +1053,19 @@ PYBIND11_MODULE(compiled, m) {
9611053

9621054
auto i = py::class_<dense_index_py_t, std::shared_ptr<dense_index_py_t>>(m, "Index");
9631055

964-
i.def(py::init(&make_index), //
965-
py::kw_only(), //
966-
py::arg("ndim") = 0, //
967-
py::arg("dtype") = scalar_kind_t::f32_k, //
968-
py::arg("connectivity") = default_connectivity(), //
969-
py::arg("expansion_add") = default_expansion_add(), //
970-
py::arg("expansion_search") = default_expansion_search(), //
971-
py::arg("metric_kind") = metric_kind_t::cos_k, //
972-
py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
973-
py::arg("metric_pointer") = 0, //
974-
py::arg("multi") = false, //
975-
py::arg("enable_key_lookups") = true //
1056+
i.def( //
1057+
py::init(&make_index), //
1058+
py::kw_only(), //
1059+
py::arg("ndim") = 0, //
1060+
py::arg("dtype") = scalar_kind_t::f32_k, //
1061+
py::arg("connectivity") = default_connectivity(), //
1062+
py::arg("expansion_add") = default_expansion_add(), //
1063+
py::arg("expansion_search") = default_expansion_search(), //
1064+
py::arg("metric_kind") = metric_kind_t::cos_k, //
1065+
py::arg("metric_signature") = metric_punned_signature_t::array_array_k, //
1066+
py::arg("metric_pointer") = 0, //
1067+
py::arg("multi") = false, //
1068+
py::arg("enable_key_lookups") = true //
9761069
);
9771070

9781071
i.def( //

python/scripts/bench_cluster.py

+136
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#!/usr/bin/env python3
2+
import os
3+
import argparse
4+
5+
import numpy as np
6+
import faiss
7+
from tqdm import tqdm
8+
9+
import usearch
10+
from usearch.index import kmeans
11+
from usearch.io import load_matrix
12+
13+
14+
def evaluate_clustering_euclidean(X, labels, centroids):
15+
"""Evaluate clustering quality as average distance to centroids"""
16+
distances = np.linalg.norm(X - centroids[labels], axis=1)
17+
return np.mean(distances)
18+
19+
20+
def evaluate_clustering_cosine(X, labels, centroids):
21+
"""Evaluate clustering quality as average cosine distance to centroids"""
22+
23+
# Normalize both data points and centroids
24+
X_normalized = X / np.linalg.norm(X, axis=1, keepdims=True)
25+
centroids_normalized = centroids / np.linalg.norm(centroids, axis=1, keepdims=True)
26+
27+
# Compute cosine similarity using dot product
28+
cosine_similarities = np.sum(X_normalized * centroids_normalized[labels], axis=1)
29+
30+
# Convert cosine similarity to cosine distance
31+
cosine_distances = 1 - cosine_similarities
32+
33+
# Return the average cosine distance
34+
return np.mean(cosine_distances)
35+
36+
37+
def numpy_initialize_centroids(X, k):
38+
"""Randomly choose k data points as initial centroids"""
39+
indices = np.random.choice(X.shape[0], k, replace=False)
40+
return X[indices]
41+
42+
43+
def numpy_assign_clusters(X, centroids):
44+
"""Assign each data point to the nearest centroid (NumPy implementation)"""
45+
distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
46+
return np.argmin(distances, axis=1)
47+
48+
49+
def numpy_update_centroids(X, labels, k):
50+
"""Compute new centroids as the mean of all data points assigned to each cluster"""
51+
return np.array([X[labels == i].mean(axis=0) for i in range(k)])
52+
53+
54+
def cluster_with_numpy(X, k, max_iters=100, tol=1e-4):
55+
centroids = numpy_initialize_centroids(X, k)
56+
57+
for i in tqdm(range(max_iters), desc="KMeans Iterations"):
58+
labels = numpy_assign_clusters(X, centroids)
59+
new_centroids = numpy_update_centroids(X, labels, k)
60+
61+
if np.linalg.norm(new_centroids - centroids) < tol:
62+
break
63+
64+
centroids = new_centroids
65+
66+
return labels, centroids
67+
68+
69+
def cluster_with_faiss(X, k, max_iters=100):
70+
# Docs: https://github.com/facebookresearch/faiss/wiki/Faiss-building-blocks:-clustering,-PCA,-quantization
71+
# Header: https://github.com/facebookresearch/faiss/blob/main/faiss/Clustering.h
72+
# Source: https://github.com/facebookresearch/faiss/blob/main/faiss/Clustering.cpp
73+
verbose = False
74+
d: int = X.shape[1]
75+
kmeans = faiss.Kmeans(d, k, niter=max_iters, verbose=verbose)
76+
kmeans.train(X)
77+
D, I = kmeans.index.search(X, 1)
78+
return I.flatten(), kmeans.centroids
79+
80+
81+
def cluster_with_usearch(X, k, max_iters=100):
82+
assignments, _, centroids = kmeans(X, k, max_iterations=max_iters)
83+
return assignments, centroids
84+
85+
86+
def main():
87+
parser = argparse.ArgumentParser(description="Compare KMeans clustering algorithms")
88+
parser.add_argument("--vectors", type=str, required=True, help="Path to binary matrix file")
89+
parser.add_argument("-k", default=10, type=int, required=True, help="Number of centroids")
90+
parser.add_argument("-i", default=100, type=int, help="Upper bound on number of iterations")
91+
parser.add_argument("-n", type=int, help="Upper bound on number of points to use")
92+
parser.add_argument(
93+
"--method",
94+
type=str,
95+
choices=["numpy", "faiss", "usearch"],
96+
default="numpy",
97+
help="Clustering backend",
98+
)
99+
100+
args = parser.parse_args()
101+
102+
max_iters = args.i
103+
X = load_matrix(args.vectors, count_rows=args.n)
104+
k = args.k
105+
method = args.method
106+
107+
time_before = os.times()
108+
if method == "usearch":
109+
labels, centroids = cluster_with_usearch(X, k, max_iters=max_iters)
110+
elif method == "faiss":
111+
labels, centroids = cluster_with_faiss(X, k, max_iters=max_iters)
112+
else:
113+
labels, centroids = cluster_with_numpy(X, k, max_iters=max_iters)
114+
time_after = os.times()
115+
time_duration = time_after[0] - time_before[0]
116+
print(f"Time: {time_duration:.2f}s, {time_duration / max_iters:.2f}s per iteration")
117+
118+
quality = evaluate_clustering_euclidean(X, labels, centroids)
119+
quality_cosine = evaluate_clustering_cosine(X, labels, centroids)
120+
print(f"Clustering quality (average distance to centroids): {quality:.4f}, cosine: {quality_cosine:.4f}")
121+
122+
# Let's compare it to some random uniform assignment
123+
random_labels = np.random.randint(0, k, size=X.shape[0])
124+
random_quality = evaluate_clustering_euclidean(X, random_labels, centroids)
125+
random_quality_cosine = evaluate_clustering_cosine(X, random_labels, centroids)
126+
print(f"... while random assignment quality: {random_quality:.4f}, cosine: {random_quality_cosine:.4f}")
127+
128+
cluster_sizes = np.unique(labels, return_counts=True)[1]
129+
cluster_sizes_mean = np.mean(cluster_sizes)
130+
cluster_sizes_stddev = np.std(cluster_sizes)
131+
print(f"Cluster sizes: {cluster_sizes_mean:.2f} ± {cluster_sizes_stddev:.2f}")
132+
print(cluster_sizes)
133+
134+
135+
if __name__ == "__main__":
136+
main()

0 commit comments

Comments
 (0)