Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Saturating the Graph #467

Open
wants to merge 6 commits into
base: main-dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions cpp/bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,27 @@ static void single_shot(dataset_at& dataset, index_at& index, bool construct = t
index_many(index, dataset.vectors_count(), ids.data(), dataset.vector(0), dataset.dimensions());
}

// Measure index stats
using index_stats_t = typename index_at::stats_t;
index_stats_t global_stats = index.stats();
index_stats_t base_stats = index.stats(0);
std::size_t base_disconnected_nodes = (*index.disconnected_nodes(0)).count();
std::size_t unreachable_nodes = (*index.unreachable_nodes()).count();
std::printf("-- Nodes: %zu\n", global_stats.nodes);
std::printf("-- Edges: %zu (%.2f %% density)\n", global_stats.edges,
global_stats.edges * 100.f / global_stats.max_edges);
std::printf("-- Edges in base: %zu (%.2f %% density)\n", base_stats.edges,
base_stats.edges * 100.f / base_stats.max_edges);
std::printf("-- Edges above base: %zu (%.2f %% density)\n", global_stats.edges - base_stats.edges,
(global_stats.edges - base_stats.edges) * 100.f / (global_stats.max_edges - base_stats.max_edges));
std::printf("-- Memory usage: %.2e bytes\n", (double)global_stats.allocated_bytes);
std::printf("-- Memory usage in base: %.2e bytes (%.2f %%)\n", (double)base_stats.allocated_bytes,
base_stats.allocated_bytes * 100.f / global_stats.allocated_bytes);
std::printf("-- Disconnected nodes in base: %zu (%.3f %%)\n", base_disconnected_nodes,
base_disconnected_nodes * 100.f / global_stats.nodes);
std::printf("-- Unreachable nodes: %zu (%.3f %%)\n", unreachable_nodes,
unreachable_nodes * 100.f / global_stats.nodes);

// Perform search, evaluate speed
std::vector<default_key_t> found_neighbors(dataset.queries_count() * dataset.neighborhood_size());
std::vector<distance_t> found_distances(dataset.queries_count() * dataset.neighborhood_size());
Expand Down
91 changes: 56 additions & 35 deletions cpp/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,9 +226,11 @@ void test_minimal_three_vectors(index_at& index, //
expect(index.add(key_first, vector_first.data(), args...));

// Default approximate search
vector_key_t matched_keys[10] = {0};
distance_t matched_distances[10] = {0};
std::size_t matched_count = index.search(vector_first.data(), 5, args...).dump_to(matched_keys, matched_distances);
constexpr std::size_t oversubscribed_results = 777;
vector_key_t matched_keys[oversubscribed_results] = {0};
distance_t matched_distances[oversubscribed_results] = {0};
std::size_t matched_count =
index.search(vector_first.data(), oversubscribed_results, args...).dump_to(matched_keys, matched_distances);

expect(matched_count == 1);
expect(matched_keys[0] == key_first);
Expand All @@ -241,19 +243,20 @@ void test_minimal_three_vectors(index_at& index, //

// Perform single entry search
{
auto search_result = index.search(vector_first.data(), 5, args...);
auto search_result = index.search(vector_first.data(), oversubscribed_results, args...);
expect(search_result);
matched_count = search_result.dump_to(matched_keys, matched_distances);
expect(matched_count != 0);
expect(matched_count == 3);
}

// Perform filtered exact search, keeping only odd values
if constexpr (punned_ak) {
auto is_odd = [](vector_key_t key) -> bool { return (key & 1) != 0; };
auto search_result = index.filtered_search(vector_first.data(), 5, is_odd, args...);
auto search_result = index.filtered_search(vector_first.data(), oversubscribed_results, is_odd, args...);
expect(search_result);
matched_count = search_result.dump_to(matched_keys, matched_distances);
expect(matched_count != 0);
std::size_t count_odd = is_odd(key_first) + is_odd(key_second) + is_odd(key_third);
expect_eq(matched_count, count_odd);
for (std::size_t i = 0; i < matched_count; i++)
expect(is_odd(matched_keys[i]));
}
Expand Down Expand Up @@ -291,11 +294,12 @@ void test_minimal_three_vectors(index_at& index, //
expect(copy_result);
index_at copied_index = std::move(copy_result.index);

// Perform single entry search
auto search_result = copied_index.search(vector_first.data(), 5, args...);
// Perform single entry search, over-subscribing,
// asking for more data than is present in the index
auto search_result = copied_index.search(vector_first.data(), oversubscribed_results, args...);
expect(search_result);
matched_count = search_result.dump_to(matched_keys, matched_distances);
expect(matched_count != 0);
expect(matched_count == 3);

// Validate scans
std::size_t count = 0;
Expand All @@ -313,10 +317,10 @@ void test_minimal_three_vectors(index_at& index, //
index_at moved_index(std::move(index));

// Perform single entry search
auto search_result = moved_index.search(vector_first.data(), 5, args...);
auto search_result = moved_index.search(vector_first.data(), oversubscribed_results, args...);
expect(search_result);
matched_count = search_result.dump_to(matched_keys, matched_distances);
expect(matched_count != 0);
expect(matched_count == 3);

// Validate scans
std::size_t count = 0;
Expand Down Expand Up @@ -354,7 +358,9 @@ void test_minimal_three_vectors(index_at& index, //

// Search again over reconstructed index
{
matched_count = index.search(vector_first.data(), 5, args...).dump_to(matched_keys, matched_distances);
matched_count = index //
.search(vector_first.data(), oversubscribed_results, args...)
.dump_to(matched_keys, matched_distances);
expect_eq(matched_count, 3);
expect_eq(matched_keys[0], key_first);
expect(std::abs(matched_distances[0]) < 0.01);
Expand Down Expand Up @@ -454,13 +460,14 @@ void test_collection(index_at& index, typename index_at::vector_key_t const star
executor_default_t executor(executor_threads);
expect(index.try_reserve({vectors.size(), executor.size()}));
executor.fixed(vectors.size(), [&](std::size_t thread, std::size_t task) {
auto task_data = vectors[task].data();
if constexpr (punned_ak) {
index_add_result_t result = index.add(start_key + task, vectors[task].data(), args...);
index_add_result_t result = index.add(start_key + task, task_data, args...);
expect(result);
} else {
index_update_config_t config;
config.thread = thread;
index_add_result_t result = index.add(start_key + task, vectors[task].data(), args..., config);
index_add_result_t result = index.add(start_key + task, task_data, args..., config);
expect(result);
}
});
Expand All @@ -472,30 +479,42 @@ void test_collection(index_at& index, typename index_at::vector_key_t const star

// Parallel search over the same vectors
executor.fixed(vectors.size(), [&](std::size_t thread, std::size_t task) {
std::size_t max_possible_matches = vectors.size();
std::size_t count_requested = max_possible_matches;
std::size_t const max_possible_matches = vectors.size();
std::size_t const count_requested = max_possible_matches * 10; // Oversubscribe
std::vector<vector_key_t> matched_keys(count_requested);
std::vector<distance_t> matched_distances(count_requested);
std::size_t matched_count = 0;
auto task_data = vectors[task].data();

// Invoke the search kernel
if constexpr (punned_ak) {
index_search_result_t result = index.search(vectors[task].data(), count_requested, args...);
expect(result);
matched_count = result.dump_to(matched_keys.data(), matched_distances.data());
{
index_search_result_t result = index.search(task_data, count_requested, args...);
expect(result);
matched_count = result.dump_to(matched_keys.data(), matched_distances.data());
}

if (matched_count != max_possible_matches) {
auto unreachable_count = index.unreachable_nodes();
index_search_result_t other_result = index.search(task_data, count_requested, args...);
}

// In approximate search we can't always expect the right answer to be found
expect_eq(matched_count, max_possible_matches);
expect_eq((vector_key_t)matched_keys[0], (vector_key_t)(start_key + task));
expect(std::abs(matched_distances[0]) < 0.01);
} else {
index_search_config_t config;
config.thread = thread;
index_search_result_t result = index.search(vectors[task].data(), count_requested, args..., config);
index_search_result_t result = index.search(task_data, count_requested, args..., config);
expect(result);
matched_count = result.dump_to(matched_keys.data(), matched_distances.data());
}

// In approximate search we can't always expect the right answer to be found
// expect_eq(matched_count, max_possible_matches);
// expect_eq(matched_keys[0], start_key + task);
// expect(std::abs(matched_distances[0]) < 0.01);
expect(matched_count <= max_possible_matches);
// In approximate search we can't always expect the right answer to be found
expect_eq(matched_count, max_possible_matches);
expect_eq((vector_key_t)matched_keys[0], (vector_key_t)(start_key + task));
expect(std::abs(matched_distances[0]) < 0.01);
}

// Check that all the distance are monotonically rising
for (std::size_t i = 1; i < matched_count; i++)
Expand Down Expand Up @@ -536,16 +555,17 @@ void test_collection(index_at& index, typename index_at::vector_key_t const star
std::vector<vector_key_t> matched_keys(count_requested);
std::vector<distance_t> matched_distances(count_requested);
std::size_t matched_count = 0;
auto task_data = vectors[task].data();

// Invoke the search kernel
if constexpr (punned_ak) {
index_search_result_t result = index.search(vectors[task].data(), count_requested, args...);
index_search_result_t result = index.search(task_data, count_requested, args...);
expect(result);
matched_count = result.dump_to(matched_keys.data(), matched_distances.data());
} else {
index_search_config_t config;
config.thread = thread;
index_search_result_t result = index.search(vectors[task].data(), count_requested, args..., config);
index_search_result_t result = index.search(task_data, count_requested, args..., config);
expect(result);
matched_count = result.dump_to(matched_keys.data(), matched_distances.data());
}
Expand Down Expand Up @@ -707,8 +727,8 @@ void test_cosine(std::size_t collection_size, std::size_t dimensions) {
test_collection<false, index_typed_t>(*aligned_index.index, 42, vector_of_vectors, metric);
}
};
for (std::size_t connectivity : {3, 13, 50})
run_templated(connectivity);
// for (std::size_t connectivity : {3, 13, 50})
// run_templated(connectivity);

// Type-punned:
auto run_punned = [&](bool multi, bool enable_key_lookups, std::size_t connectivity) {
Expand Down Expand Up @@ -1103,10 +1123,11 @@ int main(int, char**) {
// Exact search without constructing indexes.
// Great for validating the distance functions.
std::printf("Testing exact search\n");
for (std::size_t dataset_count : {10, 100})
for (std::size_t queries_count : {1, 10})
for (std::size_t wanted_count : {1, 5})
test_exact_search(dataset_count, queries_count, wanted_count);
if (0)
for (std::size_t dataset_count : {10, 100})
for (std::size_t queries_count : {1, 10})
for (std::size_t wanted_count : {1, 5})
test_exact_search(dataset_count, queries_count, wanted_count);

// Make sure the initializers and the algorithms can work with inadequately small values.
// Be warned - this combinatorial explosion of tests produces close to __500'000__ tests!
Expand Down
Loading
Loading