From 232957722dedc56c36dcf6db849214d01eae5e74 Mon Sep 17 00:00:00 2001
From: filipecosta90
Date: Wed, 12 Oct 2022 16:49:02 +0100
Subject: [PATCH] T-Digest weights are now long long

---
 CMakeLists.txt       |   4 +-
 src/tdigest.c        | 139 +++++++++++++++------------
 src/tdigest.h        |  14 +--
 tests/unit/td_test.c | 220 +++++++++++++++++--------------------
 4 files changed, 173 insertions(+), 204 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2fbbc3a..7b69609 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,8 +33,8 @@ if(ENABLE_SANITIZERS)
     message(STATUS "Forcing build type to Debug to run coverage.")
     set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Choose the type of build." FORCE)
-    set (CMAKE_C_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address")
-    set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address")
+    set (CMAKE_C_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wshadow -Wpointer-arith -Wcast-qual -Wunused -Wstrict-prototypes -Wmissing-prototypes -Wwrite-strings -Werror -fno-omit-frame-pointer -fsanitize=address")
+    set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wshadow -Wpointer-arith -Wcast-qual -Wunused -Wstrict-prototypes -Wmissing-prototypes -Wwrite-strings -Werror -fno-omit-frame-pointer -fsanitize=address")
     set (CMAKE_LINKER_FLAGS_DEBUG "${CMAKE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address")
 ENDIF()
diff --git a/src/tdigest.c b/src/tdigest.c
index 4ab18c0..7b49d31 100644
--- a/src/tdigest.c
+++ b/src/tdigest.c
@@ -20,6 +20,14 @@ static inline double weighted_average_sorted(double x1, double w1, double x2, do
     return __td_max(x1, __td_min(x, x2));
 }
 
+static inline bool _tdigest_long_long_add_safe(long long a, long long b) {
+    if (b < 0) {
+        return (a >= -__LONG_LONG_MAX__ - 1 - b);
+    } else {
+        return (a <= __LONG_LONG_MAX__ - b);
+    }
+}
+
 static inline double weighted_average(double x1, double w1, double x2, double w2) {
     if (x1 <= x2) {
         return weighted_average_sorted(x1, w1, x2, w2);
@@ -34,11 +42,17 @@ static void inline swap(double *arr, int i, int j) {
     arr[j] = temp;
 }
 
-static unsigned int partition(double *means, double *weights, unsigned int start, unsigned int end,
-                              unsigned int pivot_idx) {
+static void inline swap_l(long long *arr, int i, int j) {
+    const long long temp = arr[i];
+    arr[i] = arr[j];
+    arr[j] = temp;
+}
+
+static unsigned int partition(double *means, long long *weights, unsigned int start,
+                              unsigned int end, unsigned int pivot_idx) {
     const double pivotMean = means[pivot_idx];
     swap(means, pivot_idx, end);
-    swap(weights, pivot_idx, end);
+    swap_l(weights, pivot_idx, end);
 
     int i = start - 1;
 
@@ -48,11 +62,11 @@ static unsigned int partition(double *means, double *weights, unsigned int start
             // increment index of smaller element
             i++;
             swap(means, i, j);
-            swap(weights, i, j);
+            swap_l(weights, i, j);
         }
     }
     swap(means, i + 1, end);
-    swap(weights, i + 1, end);
+    swap_l(weights, i + 1, end);
     return i + 1;
 }
 
@@ -64,13 +78,13 @@ static unsigned int partition(double *means, double *weights, unsigned int start
  * @param start The beginning of the values to sort
  * @param end The value after the last value to sort
  */
-void td_qsort(double *means, double *weights, unsigned int start, unsigned int end) {
+static void td_qsort(double *means, long long *weights, unsigned int start, unsigned int end) {
     if (start < end) {
         // two elements can be directly compared
        if ((end - start) == 1) {
             if (means[start] > means[end]) {
                 swap(means, start, end);
-                swap(weights, start, end);
+                swap_l(weights, start, end);
             }
             return;
         }
@@ -160,7 +174,7 @@ int td_init(double compression, td_histogram_t **result) {
         td_free(histogram);
         return 1;
     }
-    histogram->nodes_weight = (double *)__td_calloc(capacity, sizeof(double));
+    histogram->nodes_weight = (long long *)__td_calloc(capacity, sizeof(long long));
     if (!histogram->nodes_weight) {
         td_free(histogram);
         return 1;
@@ -187,19 +201,22 @@ void td_free(td_histogram_t *histogram) {
 }
 
 int td_merge(td_histogram_t *into, td_histogram_t *from) {
-    td_compress(into);
-    td_compress(from);
-    for (int i = 0; i < from->merged_nodes; i++) {
+    if (td_compress(into) != 0)
+        return EDOM;
+    if (td_compress(from) != 0)
+        return EDOM;
+    const int pos = from->merged_nodes + from->unmerged_nodes;
+    for (int i = 0; i < pos; i++) {
         const double mean = from->nodes_mean[i];
-        const double count = from->nodes_weight[i];
-        if (td_add(into, mean, count) != 0) {
+        const long long weight = from->nodes_weight[i];
+        if (td_add(into, mean, weight) != 0) {
             return EDOM;
         }
     }
     return 0;
 }
 
-double td_size(td_histogram_t *h) { return h->merged_weight + h->unmerged_weight; }
+long long td_size(td_histogram_t *h) { return h->merged_weight + h->unmerged_weight; }
 
 double td_cdf(td_histogram_t *h, double val) {
     td_compress(h);
@@ -229,7 +246,8 @@ double td_cdf(td_histogram_t *h, double val) {
     const int n = h->merged_nodes;
     // check for the left tail
     const double left_centroid_mean = h->nodes_mean[0];
-    const double left_centroid_weight = h->nodes_weight[0];
+    const double left_centroid_weight = (double)h->nodes_weight[0];
+    const double merged_weight_d = (double)h->merged_weight;
     if (val < left_centroid_mean) {
         // note that this is different than h->nodes_mean[0] > min
         // ... this guarantees we divide by non-zero number and interpolation works
@@ -237,10 +255,10 @@ double td_cdf(td_histogram_t *h, double val) {
         if (width > 0) {
             // must be a sample exactly at min
             if (val == h->min) {
-                return 0.5 / h->merged_weight;
+                return 0.5 / merged_weight_d;
             } else {
                 return (1 + (val - h->min) / width * (left_centroid_weight / 2 - 1)) /
-                       h->merged_weight;
+                       merged_weight_d;
             }
         } else {
             // this should be redundant with the check val < h->min
@@ -249,16 +267,16 @@ double td_cdf(td_histogram_t *h, double val) {
     }
     // and the right tail
     const double right_centroid_mean = h->nodes_mean[n - 1];
-    const double right_centroid_weight = h->nodes_weight[n - 1];
+    const double right_centroid_weight = (double)h->nodes_weight[n - 1];
     if (val > right_centroid_mean) {
         const double width = h->max - right_centroid_mean;
         if (width > 0) {
             if (val == h->max) {
-                return 1 - 0.5 / h->merged_weight;
+                return 1 - 0.5 / merged_weight_d;
             } else {
                 // there has to be a single sample exactly at max
                 const double dq = (1 + (h->max - val) / width * (right_centroid_weight / 2 - 1)) /
-                                  h->merged_weight;
+                                  merged_weight_d;
                 return 1 - dq;
             }
         } else {
@@ -276,13 +294,13 @@ double td_cdf(td_histogram_t *h, double val) {
             // dw will accumulate the weight of all of the centroids at x
             double dw = 0;
             while (it < n && h->nodes_mean[it] == val) {
-                dw += h->nodes_weight[it];
+                dw += (double)h->nodes_weight[it];
                 it++;
             }
-            return (weightSoFar + dw / 2) / h->merged_weight;
+            return (weightSoFar + dw / 2) / (double)h->merged_weight;
         } else if (h->nodes_mean[it] <= val && val < h->nodes_mean[it + 1]) {
-            const double node_weight = h->nodes_weight[it];
-            const double node_weight_next = h->nodes_weight[it + 1];
+            const double node_weight = (double)h->nodes_weight[it];
+            const double node_weight_next = (double)h->nodes_weight[it + 1];
             const double node_mean = h->nodes_mean[it];
             const double node_mean_next = h->nodes_mean[it + 1];
             // landed between centroids ... check for floating point madness
@@ -297,7 +315,7 @@ double td_cdf(td_histogram_t *h, double val) {
                 if (node_weight_next == 1) {
                     // two singletons means no interpolation
                     // left singleton is in, right is out
-                    return (weightSoFar + 1) / h->merged_weight;
+                    return (weightSoFar + 1) / merged_weight_d;
                 } else {
                     leftExcludedW = 0.5;
                 }
@@ -311,19 +329,19 @@ double td_cdf(td_histogram_t *h, double val) {
                 double base = weightSoFar + node_weight / 2 + leftExcludedW;
                 return (base + dwNoSingleton * (val - node_mean) / (node_mean_next - node_mean)) /
-                       h->merged_weight;
+                       merged_weight_d;
             } else {
                 // this is simply caution against floating point madness
                 // it is conceivable that the centroids will be different
                 // but too near to allow safe interpolation
                 double dw = (node_weight + node_weight_next) / 2;
-                return (weightSoFar + dw) / h->merged_weight;
+                return (weightSoFar + dw) / merged_weight_d;
             }
         } else {
-            weightSoFar += h->nodes_weight[it];
+            weightSoFar += (double)h->nodes_weight[it];
         }
     }
-    return 1 - 0.5 / h->merged_weight;
+    return 1 - 0.5 / merged_weight_d;
 }
 
 static double td_internal_iterate_centroids_to_index(const td_histogram_t *h, const double index,
@@ -342,17 +360,18 @@ static double td_internal_iterate_centroids_to_index(const td_histogram_t *h, co
 
     // if the right-most centroid has more than one sample, we still know
     // that one sample occurred at max so we can do some interpolation
-    const double right_centroid_weight = h->nodes_weight[total_centroids - 1];
+    const double right_centroid_weight = (double)h->nodes_weight[total_centroids - 1];
     const double right_centroid_mean = h->nodes_mean[total_centroids - 1];
-    if (right_centroid_weight > 1 && h->merged_weight - index <= right_centroid_weight / 2) {
-        return h->max - (h->merged_weight - index - 1) / (right_centroid_weight / 2 - 1) *
+    if (right_centroid_weight > 1 &&
+        (double)h->merged_weight - index <= right_centroid_weight / 2) {
+        return h->max - ((double)h->merged_weight - index - 1) / (right_centroid_weight / 2 - 1) *
                             (h->max - right_centroid_mean);
     }
 
     for (; *node_pos < total_centroids - 1; (*node_pos)++) {
         const int i = *node_pos;
-        const double node_weight = h->nodes_weight[i];
-        const double node_weight_next = h->nodes_weight[i + 1];
+        const double node_weight = (double)h->nodes_weight[i];
+        const double node_weight_next = (double)h->nodes_weight[i + 1];
         const double node_mean = h->nodes_mean[i];
         const double node_mean_next = h->nodes_mean[i + 1];
         const double dw = (node_weight + node_weight_next) / 2;
@@ -402,7 +421,7 @@ double td_quantile(td_histogram_t *h, double q) {
     }
 
     // if values were stored in a sorted array, index would be the offset we are interested in
-    const double index = q * h->merged_weight;
+    const double index = q * (double)h->merged_weight;
 
     // beyond the boundaries, we return min or max
     // usually, the first centroid will have unit weight so this will make it moot
@@ -415,7 +434,7 @@ double td_quantile(td_histogram_t *h, double q) {
 
     // if the left centroid has more than one sample, we still know
     // that one sample occurred at min so we can do some interpolation
-    const double left_centroid_weight = h->nodes_weight[0];
+    const double left_centroid_weight = (double)h->nodes_weight[0];
 
     // in between extremes we interpolate between centroids
     double weightSoFar = left_centroid_weight / 2;
@@ -456,11 +475,7 @@ int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, siz
     // we know that there are at least two centroids now
     // if the left centroid has more than one sample, we still know
     // that one sample occurred at min so we can do some interpolation
-    const double left_centroid_weight = h->nodes_weight[0];
-
-    // if the right-most centroid has more than one sample, we still know
-    // that one sample occurred at max so we can do some interpolation
-    const double right_centroid_weight = h->nodes_weight[n - 1];
+    const double left_centroid_weight = (double)h->nodes_weight[0];
 
     // in between extremes we interpolate between centroids
     double weightSoFar = left_centroid_weight / 2;
@@ -469,7 +484,7 @@ int td_quantiles(td_histogram_t *h, const double *quantiles, double *values, siz
     // to avoid allocations we use the values array for intermediate computation
     // i.e. to store the expected cumulative count at each percentile
     for (size_t qpos = 0; qpos < length; qpos++) {
-        const double index = quantiles[qpos] * h->merged_weight;
+        const double index = quantiles[qpos] * (double)h->merged_weight;
         values[qpos] = td_internal_iterate_centroids_to_index(h, index, left_centroid_weight, n,
                                                               &weightSoFar, &node_pos);
     }
@@ -483,7 +498,7 @@ static double td_internal_trimmed_mean(const td_histogram_t *h, const double lef
     double trimmed_count = 0;
 
     for (int i = 0; i < h->merged_nodes; i++) {
-        const double n_weight = h->nodes_weight[i];
+        const double n_weight = (double)h->nodes_weight[i];
         // Assume the whole centroid falls into the range
         double count_add = n_weight;
 
@@ -521,8 +536,8 @@ double td_trimmed_mean_symmetric(td_histogram_t *h, double proportion_to_cut) {
     }
 
     /* translate the percentiles to counts */
-    const double leftmost_weight = floor(h->merged_weight * proportion_to_cut);
-    const double rightmost_weight = ceil(h->merged_weight * (1.0 - proportion_to_cut));
+    const double leftmost_weight = floor((double)h->merged_weight * proportion_to_cut);
+    const double rightmost_weight = ceil((double)h->merged_weight * (1.0 - proportion_to_cut));
 
     return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight);
 }
@@ -540,13 +555,13 @@ double td_trimmed_mean(td_histogram_t *h, double leftmost_cut, double rightmost_
     }
 
     /* translate the percentiles to counts */
-    const double leftmost_weight = floor(h->merged_weight * leftmost_cut);
-    const double rightmost_weight = ceil(h->merged_weight * rightmost_cut);
+    const double leftmost_weight = floor((double)h->merged_weight * leftmost_cut);
+    const double rightmost_weight = ceil((double)h->merged_weight * rightmost_cut);
 
     return td_internal_trimmed_mean(h, leftmost_weight, rightmost_weight);
 }
 
-int td_add(td_histogram_t *h, double mean, double weight) {
+int td_add(td_histogram_t *h, double mean, long long weight) {
     if (should_td_compress(h)) {
         const int overflow_res = td_compress(h);
         if (overflow_res != 0)
@@ -555,10 +570,15 @@ int td_add(td_histogram_t *h, double mean, double weight) {
     const int pos = next_node(h);
     if (pos >= h->cap)
         return EDOM;
-    const double new_unmerged_weight = h->unmerged_weight + weight;
-    const double new_total_weight = new_unmerged_weight + h->merged_weight;
+    if (_tdigest_long_long_add_safe(h->unmerged_weight, weight) == false)
+        return EDOM;
+    const long long new_unmerged_weight = h->unmerged_weight + weight;
+    if (_tdigest_long_long_add_safe(new_unmerged_weight, h->merged_weight) == false)
+        return EDOM;
+    const long long new_total_weight = new_unmerged_weight + h->merged_weight;
     // double-precision overflow detected
-    const int overflow_res = _check_td_overflow(new_unmerged_weight, new_total_weight);
+    const int overflow_res =
+        _check_td_overflow((double)new_unmerged_weight, (double)new_total_weight);
     if (overflow_res != 0)
         return overflow_res;
 
@@ -581,9 +601,9 @@ int td_compress(td_histogram_t *h) {
     }
     int N = h->merged_nodes + h->unmerged_nodes;
     td_qsort(h->nodes_mean, h->nodes_weight, 0, N - 1);
-    const double total_weight = h->merged_weight + h->unmerged_weight;
+    const double total_weight = (double)h->merged_weight + (double)h->unmerged_weight;
     // double-precision overflow detected
-    const int overflow_res = _check_td_overflow(h->unmerged_weight, total_weight);
+    const int overflow_res = _check_td_overflow((double)h->unmerged_weight, (double)total_weight);
     if (overflow_res != 0)
         return overflow_res;
     if (total_weight <= 1)
@@ -600,7 +620,7 @@ int td_compress(td_histogram_t *h) {
     double weight_so_far = 0;
 
     for (int i = 1; i < N; i++) {
-        const double proposed_weight = h->nodes_weight[cur] + h->nodes_weight[i];
+        const double proposed_weight = (double)h->nodes_weight[cur] + (double)h->nodes_weight[i];
         const double z = proposed_weight * normalizer;
         // quantile up to cur
         const double q0 = weight_so_far / total_weight;
@@ -622,7 +642,7 @@ int td_compress(td_histogram_t *h) {
             h->nodes_mean[cur] = h->nodes_mean[i];
         }
         if (cur != i) {
-            h->nodes_weight[i] = 0.0;
+            h->nodes_weight[i] = 0;
             h->nodes_mean[i] = 0.0;
         }
     }
@@ -640,16 +660,11 @@ double td_max(td_histogram_t *h) { return h->max; }
 
 int td_compression(td_histogram_t *h) { return h->compression; }
 
-const double *td_centroids_weight(td_histogram_t *h) { return h->nodes_weight; }
+const long long *td_centroids_weight(td_histogram_t *h) { return h->nodes_weight; }
 
 const double *td_centroids_mean(td_histogram_t *h) { return h->nodes_mean; }
 
-double td_centroids_weight_at(td_histogram_t *h, int pos) {
-    if (pos < 0 || pos > h->merged_nodes) {
-        return NAN;
-    }
-    return h->nodes_weight[pos];
-}
+long long td_centroids_weight_at(td_histogram_t *h, int pos) { return h->nodes_weight[pos]; }
 
 double td_centroids_mean_at(td_histogram_t *h, int pos) {
     if (pos < 0 || pos > h->merged_nodes) {
diff --git a/src/tdigest.h b/src/tdigest.h
index 32b35ba..c07436c 100644
--- a/src/tdigest.h
+++ b/src/tdigest.h
@@ -42,11 +42,11 @@ struct td_histogram {
     // we run the merge in reverse every other merge to avoid left-to-right bias in merging
     long long total_compressions;
 
-    double merged_weight;
-    double unmerged_weight;
+    long long merged_weight;
+    long long unmerged_weight;
 
     double *nodes_mean;
-    double *nodes_weight;
+    long long *nodes_weight;
 };
 
 typedef struct td_histogram td_histogram_t;
@@ -106,7 +106,7 @@ void td_reset(td_histogram_t *h);
  * weight.
  *
  */
-int td_add(td_histogram_t *h, double val, double weight);
+int td_add(td_histogram_t *h, double val, long long weight);
 
 /**
  * Re-examines a t-digest to determine whether some centroids are redundant. If your data are
@@ -190,7 +190,7 @@ int td_compression(td_histogram_t *h);
 *
 * @return The sum of the weights on all centroids.
 */
-double td_size(td_histogram_t *h);
+long long td_size(td_histogram_t *h);
 
 /**
 * Returns the number of centroids being used by this TDigest.
@@ -222,7 +222,7 @@ double td_max(td_histogram_t *h);
 *
 * @return The full centroids weight array.
 */
-const double *td_centroids_weight(td_histogram_t *h);
+const long long *td_centroids_weight(td_histogram_t *h);
 
 /**
 * Get the full centroids mean array for 'this' histogram.
@@ -241,7 +241,7 @@ const double *td_centroids_mean(td_histogram_t *h);
 *
 * @return The centroid weight.
 */
-double td_centroids_weight_at(td_histogram_t *h, int pos);
+long long td_centroids_weight_at(td_histogram_t *h, int pos);
 
 /**
 * Get the centroid mean for 'this' histogram and 'pos'.
diff --git a/tests/unit/td_test.c b/tests/unit/td_test.c
index 2434066..272fbfb 100644
--- a/tests/unit/td_test.c
+++ b/tests/unit/td_test.c
@@ -21,42 +21,11 @@ static double randfrom(double M, double N) {
     return M + (rand() / (RAND_MAX / (N - M)));
 }
 
-/**
- * Reference implementations for cdf if we have all data.
- */
-static double dist_cdf(double x, double *data, int data_length) {
-    double n1 = 0;
-    double n2 = 0;
-    for (size_t i = 0; i < data_length; i++) {
-        const double v = data[i];
-        n1 += (v < x) ? 1 : 0;
-        n2 += (v == x) ? 1 : 0;
-    }
-    return (n1 + n2 / 2.0) / data_length;
-}
-
-/**
- * Reference implementations for quantile if we have all data.
- */
-static double dist_quantile(double q, double *data, int data_length) {
-    if (data_length == 0) {
-        return NAN;
-    }
-    double index = q * data_length;
-    if (index < 0) {
-        index = 0;
-    }
-    if (index > data_length - 1) {
-        index = data_length - 1;
-    }
-    return data[(int)floor(index)];
-}
-
 int tests_run = 0;
 
 td_histogram_t *histogram = NULL;
 
-static void load_histograms() {
+static void load_histograms(void) {
     const int compression = 500;
 
     int i;
@@ -73,13 +42,13 @@
 MU_TEST(test_basic) {
     td_histogram_t *t = td_new(10);
     mu_assert(t != NULL, "created_histogram");
-    mu_assert_double_eq(0, t->unmerged_weight);
-    mu_assert_double_eq(0, t->merged_weight);
+    mu_assert_long_eq(0, t->unmerged_weight);
+    mu_assert_long_eq(0, t->merged_weight);
     mu_assert(td_add(t, 0.0, 1) == 0, "Insertion");
     // with one data point, all quantiles lead to Rome
     mu_assert_double_eq(0.0, td_quantile(t, .0));
     mu_assert_double_eq(0.0, td_quantile(t, 0.5));
-    mu_assert_double_eq(0.0, td_quantile(t, 1.0));
+    mu_assert_double_eq(0.0, td_quantile(t, 1));
     mu_assert(td_add(t, 10.0, 1) == 0, "Insertion");
     mu_assert_double_eq(0.0, td_min(t));
     mu_assert_double_eq(10.0, td_max(t));
@@ -99,18 +68,21 @@ MU_TEST(test_overflow) {
     td_histogram_t *t = td_new(10);
     td_histogram_t *t2 = td_new(10);
     mu_assert(t != NULL, "created_histogram");
-    // mu_assert(t2 != NULL, "created_histogram");
-    mu_assert_double_eq(0, t->unmerged_weight);
-    mu_assert_double_eq(0, t->merged_weight);
-    mu_assert_double_eq(0, t2->unmerged_weight);
-    mu_assert_double_eq(0, t2->merged_weight);
-    for (size_t i = 0; i < 4; i++) {
-        mu_assert(td_add(t, 5.0, 1e304) == 0, "Insertion of 1e304");
-    }
-    mu_assert(td_add(t, 5.0, 1e304) == EDOM, "5th insertion of 1e305 should overflow");
+    mu_assert(t2 != NULL, "created_histogram");
+    mu_assert_long_eq(0, t->unmerged_weight);
+    mu_assert_long_eq(0, t->merged_weight);
+    mu_assert_long_eq(0, t2->unmerged_weight);
+    mu_assert_long_eq(0, t2->merged_weight);
+    mu_assert(td_add(t, 5.0, __LONG_LONG_MAX__ - 1) == 0, "Insertion of __LONG_LONG_MAX__ - 1");
+    mu_assert(td_add(t, 5.0, __LONG_LONG_MAX__ - 1) == EDOM,
+              "second insertion of __LONG_LONG_MAX__ - 1 should overflow");
+    mu_assert_long_eq(__LONG_LONG_MAX__ - 1, t->merged_weight + t->unmerged_weight);
     // overflow on merge
-    mu_assert(td_add(t2, 5.0, 1e304) == 0, "First insertion of 1e304");
-    mu_assert(td_merge(t2, t) == EDOM, "Merge due to second insertion of 1e300 should overflow");
+    mu_assert(td_add(t2, 5.0, __LONG_LONG_MAX__ - 1) == 0, "First insertion of __LONG_LONG_MAX__ - 1");
+    mu_assert_long_eq(__LONG_LONG_MAX__ - 1, t2->merged_weight + t2->unmerged_weight);
+    mu_assert(td_add(t2, 1.0, 1) == 0, "Insertion of 1");
+    mu_assert(td_add(t2, 5.0, __LONG_LONG_MAX__ - 1) == EDOM,
+              "Second insertion of __LONG_LONG_MAX__ - 1 should overflow");
     td_free(t);
     td_free(t2);
 }
@@ -122,50 +94,50 @@ MU_TEST(test_overflow_merge) {
     mu_assert(x != NULL, "created_histogram");
     mu_assert(y != NULL, "created_histogram");
     mu_assert(z != NULL, "created_histogram");
-    mu_assert_double_eq(0, x->unmerged_weight);
-    mu_assert_double_eq(0, x->merged_weight);
-    mu_assert_double_eq(0, y->unmerged_weight);
-    mu_assert_double_eq(0, y->merged_weight);
-    mu_assert(td_add(x, 1, 1.0) == 0, "Insertion of 1");
-    mu_assert(td_add(x, 2, 1.0) == 0, "Insertion of 2");
-    mu_assert(td_add(x, 3, 1.0) == 0, "Insertion of 3");
-    mu_assert(td_add(x, 4, 1.0) == 0, "Insertion of 4");
-    mu_assert(td_add(x, 5, 1.0) == 0, "Insertion of 5");
-    mu_assert(td_add(x, 6, 1.0) == 0, "Insertion of 6");
-    mu_assert(td_add(x, 7, 1.0) == 0, "Insertion of 7");
-    mu_assert(td_add(x, 8, 1.0) == 0, "Insertion of 8");
-    mu_assert(td_add(x, 9, 1.0) == 0, "Insertion of 9");
-    mu_assert(td_add(x, 10, 1.0) == 0, "Insertion of 10");
-    mu_assert(td_add(x, 11, 1.0) == 0, "Insertion of 11");
-    mu_assert(td_add(x, 12, 1.0) == 0, "Insertion of 12");
-    mu_assert(td_add(x, 13, 1.0) == 0, "Insertion of 13");
-    mu_assert(td_add(x, 14, 1.0) == 0, "Insertion of 14");
-    mu_assert(td_add(x, 15, 1.0) == 0, "Insertion of 15");
-    mu_assert(td_add(x, 16, 1.0) == 0, "Insertion of 16");
-    mu_assert(td_add(x, 17, 1.0) == 0, "Insertion of 17");
-    mu_assert(td_add(x, 18, 1.0) == 0, "Insertion of 18");
-    mu_assert(td_add(x, 19, 1.0) == 0, "Insertion of 19");
-    mu_assert(td_add(x, 20, 1.0) == 0, "Insertion of 20");
-    mu_assert(td_add(y, 101, 1.0) == 0, "Insertion of 101");
-    mu_assert(td_add(y, 102, 1.0) == 0, "Insertion of 102");
-    mu_assert(td_add(y, 103, 1.0) == 0, "Insertion of 103");
-    mu_assert(td_add(y, 104, 1.0) == 0, "Insertion of 104");
-    mu_assert(td_add(y, 105, 1.0) == 0, "Insertion of 105");
-    mu_assert(td_add(y, 106, 1.0) == 0, "Insertion of 106");
-    mu_assert(td_add(y, 107, 1.0) == 0, "Insertion of 107");
-    mu_assert(td_add(y, 108, 1.0) == 0, "Insertion of 108");
-    mu_assert(td_add(y, 109, 1.0) == 0, "Insertion of 109");
-    mu_assert(td_add(y, 110, 1.0) == 0, "Insertion of 110");
-    mu_assert(td_add(y, 111, 1.0) == 0, "Insertion of 111");
-    mu_assert(td_add(y, 112, 1.0) == 0, "Insertion of 112");
-    mu_assert(td_add(y, 113, 1.0) == 0, "Insertion of 113");
-    mu_assert(td_add(y, 114, 1.0) == 0, "Insertion of 114");
-    mu_assert(td_add(y, 115, 1.0) == 0, "Insertion of 115");
-    mu_assert(td_add(y, 116, 1.0) == 0, "Insertion of 116");
-    mu_assert(td_add(y, 117, 1.0) == 0, "Insertion of 117");
-    mu_assert(td_add(y, 118, 1.0) == 0, "Insertion of 118");
-    mu_assert(td_add(y, 119, 1.0) == 0, "Insertion of 119");
-    mu_assert(td_add(y, 120, 1.0) == 0, "Insertion of 120");
+    mu_assert_long_eq(0, x->unmerged_weight);
+    mu_assert_long_eq(0, x->merged_weight);
+    mu_assert_long_eq(0, y->unmerged_weight);
+    mu_assert_long_eq(0, y->merged_weight);
+    mu_assert(td_add(x, 1, 1) == 0, "Insertion of 1");
+    mu_assert(td_add(x, 2, 1) == 0, "Insertion of 2");
+    mu_assert(td_add(x, 3, 1) == 0, "Insertion of 3");
+    mu_assert(td_add(x, 4, 1) == 0, "Insertion of 4");
+    mu_assert(td_add(x, 5, 1) == 0, "Insertion of 5");
+    mu_assert(td_add(x, 6, 1) == 0, "Insertion of 6");
+    mu_assert(td_add(x, 7, 1) == 0, "Insertion of 7");
+    mu_assert(td_add(x, 8, 1) == 0, "Insertion of 8");
+    mu_assert(td_add(x, 9, 1) == 0, "Insertion of 9");
+    mu_assert(td_add(x, 10, 1) == 0, "Insertion of 10");
+    mu_assert(td_add(x, 11, 1) == 0, "Insertion of 11");
+    mu_assert(td_add(x, 12, 1) == 0, "Insertion of 12");
+    mu_assert(td_add(x, 13, 1) == 0, "Insertion of 13");
+    mu_assert(td_add(x, 14, 1) == 0, "Insertion of 14");
+    mu_assert(td_add(x, 15, 1) == 0, "Insertion of 15");
+    mu_assert(td_add(x, 16, 1) == 0, "Insertion of 16");
+    mu_assert(td_add(x, 17, 1) == 0, "Insertion of 17");
+    mu_assert(td_add(x, 18, 1) == 0, "Insertion of 18");
+    mu_assert(td_add(x, 19, 1) == 0, "Insertion of 19");
+    mu_assert(td_add(x, 20, 1) == 0, "Insertion of 20");
+    mu_assert(td_add(y, 101, 1) == 0, "Insertion of 101");
+    mu_assert(td_add(y, 102, 1) == 0, "Insertion of 102");
+    mu_assert(td_add(y, 103, 1) == 0, "Insertion of 103");
+    mu_assert(td_add(y, 104, 1) == 0, "Insertion of 104");
+    mu_assert(td_add(y, 105, 1) == 0, "Insertion of 105");
+    mu_assert(td_add(y, 106, 1) == 0, "Insertion of 106");
+    mu_assert(td_add(y, 107, 1) == 0, "Insertion of 107");
+    mu_assert(td_add(y, 108, 1) == 0, "Insertion of 108");
+    mu_assert(td_add(y, 109, 1) == 0, "Insertion of 109");
+    mu_assert(td_add(y, 110, 1) == 0, "Insertion of 110");
+    mu_assert(td_add(y, 111, 1) == 0, "Insertion of 111");
+    mu_assert(td_add(y, 112, 1) == 0, "Insertion of 112");
+    mu_assert(td_add(y, 113, 1) == 0, "Insertion of 113");
+    mu_assert(td_add(y, 114, 1) == 0, "Insertion of 114");
+    mu_assert(td_add(y, 115, 1) == 0, "Insertion of 115");
+    mu_assert(td_add(y, 116, 1) == 0, "Insertion of 116");
+    mu_assert(td_add(y, 117, 1) == 0, "Insertion of 117");
+    mu_assert(td_add(y, 118, 1) == 0, "Insertion of 118");
+    mu_assert(td_add(y, 119, 1) == 0, "Insertion of 119");
+    mu_assert(td_add(y, 120, 1) == 0, "Insertion of 120");
 
     for (size_t i = 0; i < 10; i++) {
         td_histogram_t *zz = td_new(10);
@@ -193,16 +165,17 @@ MU_TEST(test_quantile_interpolations) {
     td_histogram_t *t = td_new(10);
     mu_assert(t != NULL, "created_histogram");
-    mu_assert_double_eq(0, t->unmerged_weight);
-    mu_assert_double_eq(0, t->merged_weight);
+    mu_assert_long_eq(0, t->unmerged_weight);
+    mu_assert_long_eq(0, t->merged_weight);
     mu_assert(td_add(t, 5.0, 2) == 0, "add");
-    mu_assert_double_eq(1, t->unmerged_weight);
+    mu_assert_long_eq(2, t->unmerged_weight);
     // with one data point, all quantiles lead to Rome
-    mu_assert_double_eq(0.0, td_quantile(t, .0));
-    mu_assert_double_eq(0.0, td_quantile(t, 0.5));
+    mu_assert_double_eq(5.0, td_quantile(t, .0));
+    mu_assert_double_eq(5.0, td_quantile(t, 0.5));
+    mu_assert_double_eq(5.0, td_quantile(t, 1.0));
     mu_assert(td_compress(t) == 0, "compress");
-    mu_assert_double_eq(0, t->unmerged_weight);
-    mu_assert_double_eq(2.0, t->merged_weight);
+    mu_assert_long_eq(0, t->unmerged_weight);
+    mu_assert_long_eq(2, t->merged_weight);
     mu_assert(td_add(t, 100.0, 1) == 0, "Insertion");
     // we know that there are at least two centroids now
     td_free(t);
 }
@@ -220,8 +193,8 @@ MU_TEST(test_trimmed_mean_simple) {
     */
     td_histogram_t *t = td_new(100);
     mu_assert(t != NULL, "created_histogram");
-    mu_assert_double_eq(0, t->unmerged_weight);
-    mu_assert_double_eq(0, t->merged_weight);
+    mu_assert_long_eq(0, t->unmerged_weight);
+    mu_assert_long_eq(0, t->merged_weight);
     // stats.trim_mean([], 0.49)
     // nan
     mu_assert_double_eq(NAN, td_trimmed_mean_symmetric(t, .49));
@@ -238,17 +211,17 @@ MU_TEST(test_trimmed_mean_simple) {
     // 5.0
     // stats.trim_mean(x, 0.0)
     mu_assert_double_eq(5, td_trimmed_mean_symmetric(t, .0));
-    mu_assert_double_eq(5, td_trimmed_mean(t, 0.0, 1.0));
+    mu_assert_double_eq(5, td_trimmed_mean(t, 0.0, 1));
     // 5.0
     mu_assert(td_add(t, 5.0, 2) == 0, "Insertion");
     mu_assert_double_eq(5, td_trimmed_mean_symmetric(t, .0));
-    mu_assert_double_eq(5, td_trimmed_mean(t, 0.0, 1.0));
+    mu_assert_double_eq(5, td_trimmed_mean(t, 0.0, 1));
     mu_assert(td_add(t, 10.0, 1) == 0, "Insertion");
     mu_assert(td_add(t, 15.0, 3) == 0, "Insertion");
     // stats.trim_mean(x, 0.0)
     // 10.0
     mu_assert_double_eq(10, td_trimmed_mean_symmetric(t, .0));
-    mu_assert_double_eq(10, td_trimmed_mean(t, 0.0, 1.0));
+    mu_assert_double_eq(10, td_trimmed_mean(t, 0.0, 1));
     // trimmed mean and mean should lead to 10 in here
     // stats.trim_mean(x, 0.1)
     // 10.0
@@ -272,8 +245,8 @@ MU_TEST(test_trimmed_mean_complex) {
     */
     td_histogram_t *t = td_new(100);
     mu_assert(t != NULL, "created_histogram");
-    mu_assert_double_eq(0, t->unmerged_weight);
-    mu_assert_double_eq(0, t->merged_weight);
+    mu_assert_long_eq(0, t->unmerged_weight);
+    mu_assert_long_eq(0, t->merged_weight);
    for (int i = 0; i < 20; ++i) {
         mu_assert(td_add(t, (double)i, 1) == 0, "Insertion");
     }
@@ -285,8 +258,8 @@ MU_TEST(test_trimmed_mean_complex) {
     td_free(t);
     t = td_new(100);
     mu_assert(t != NULL, "created_histogram");
-    mu_assert_double_eq(0, t->unmerged_weight);
-    mu_assert_double_eq(0, t->merged_weight);
+    mu_assert_long_eq(0, t->unmerged_weight);
+    mu_assert_long_eq(0, t->merged_weight);
     for (int i = 0; i < 200; ++i) {
         mu_assert(td_add(t, (double)i, 1) == 0, "Insertion");
     }
@@ -326,20 +299,20 @@
 MU_TEST(test_compress_small) {
     td_histogram_t *t = td_new(100);
     mu_assert(t != NULL, "created_histogram");
-    mu_assert(td_add(t, 1.0, 1.0) == 0, "Insertion");
+    mu_assert(td_add(t, 1.0, 1) == 0, "Insertion");
     mu_assert_double_eq(1.0, td_min(t));
     mu_assert_double_eq(1.0, td_max(t));
     mu_assert_double_eq(1.0, td_size(t));
     mu_assert_int_eq(1, td_centroid_count(t));
     mu_assert_long_eq(0, t->total_compressions);
     mu_assert_double_eq(1.0, td_centroids_mean_at(t, 0));
-    mu_assert_double_eq(1.0, td_centroids_weight_at(t, 0));
+    mu_assert_long_eq(1, td_centroids_weight_at(t, 0));
     mu_assert_int_eq(1, t->unmerged_nodes);
     mu_assert_int_eq(0, t->merged_nodes);
     mu_assert(td_compress(t) == 0, "compress");
-    mu_assert_int_eq(1, t->unmerged_nodes + t->merged_nodes);
+    mu_assert_long_eq(1, t->unmerged_nodes + t->merged_nodes);
     mu_assert_double_eq(1.0, td_centroids_mean_at(t, 0));
-    mu_assert_double_eq(1.0, td_centroids_weight_at(t, 0));
+    mu_assert_long_eq(1, td_centroids_weight_at(t, 0));
     mu_assert_double_eq(1.0, td_quantile(t, 0.001));
     mu_assert_double_eq(1.0, td_quantile(t, 0.01));
     mu_assert_double_eq(1.0, td_quantile(t, 0.5));
@@ -387,7 +360,7 @@ MU_TEST(test_negative_values) {
     mu_assert_double_eq_epsilon(-98.5, td_quantile(t, 0.01), 0.75);
     mu_assert_double_eq_epsilon(98.5, td_quantile(t, 0.99), 0.75);
     mu_assert_double_eq(100, td_quantile(t, 0.999));
-    mu_assert_double_eq(100, td_quantile(t, 1.0));
+    mu_assert_double_eq(100, td_quantile(t, 1));
     td_free(t);
 }
 
@@ -410,7 +383,7 @@ MU_TEST(test_negative_values_merge) {
     mu_assert_double_eq_epsilon(-98.5, td_quantile(d1, 0.01), 0.75);
     mu_assert_double_eq_epsilon(98.5, td_quantile(d1, 0.99), 0.75);
     mu_assert_double_eq(100, td_quantile(d1, 0.999));
-    mu_assert_double_eq(100, td_quantile(d1, 1.0));
+    mu_assert_double_eq(100, td_quantile(d1, 1));
     td_free(d1);
     td_free(d2);
 }
@@ -432,9 +405,6 @@ MU_TEST(test_nans) {
     mu_assert(isnan(td_quantile(t, 0)), "empty value at 0");
     mu_assert(isnan(td_quantile(t, 0.5)), "empty value at .5");
     mu_assert(isnan(td_quantile(t, 1)), "empty value at 1");
-    mu_assert(isnan(td_centroids_weight_at(t, 1)),
-              "td_centroids_weight_at on pos > h->merged_nodes");
-    mu_assert(isnan(td_centroids_weight_at(t, -1)), "td_centroids_weight_at on pos < 0");
     mu_assert(isnan(td_centroids_mean_at(t, 1)), "td_centroids_mean_at on pos > h->merged_nodes");
     mu_assert(isnan(td_centroids_mean_at(t, -1)), "td_centroids_mean_at on pos < 0");
     mu_assert(td_add(t, 1, 1) == 0, "Insertion");
@@ -541,23 +511,6 @@ MU_TEST(test_td_init) {
     td_free(t);
 }
 
-bool compare_double(double a, double b, double delta) {
-    if (fabs(a - b) < delta) {
-        return true;
-    }
-
-    printf("[compare_double] fabs(%f, %f) < %f == false\n", a, b, delta);
-    return false;
-}
-
-static bool compare_values(double a, double b, double variation) {
-    return compare_double(a, b, b * variation);
-}
-
-static bool compare_percentile(int64_t a, double b, double variation) {
-    return compare_values((double)a, b, variation);
-}
-
 MU_TEST(test_quantiles) {
     load_histograms();
     mu_assert_double_eq_epsilon(0.0, td_quantile(histogram, 0.0), 0.001);
@@ -573,7 +526,7 @@ MU_TEST(test_quantiles) {
     mu_assert_double_eq_epsilon(9.99, td_quantile(histogram, 0.999), 0.01);
     mu_assert_double_eq_epsilon(9.999, td_quantile(histogram, 0.9999), 0.01);
     mu_assert_double_eq_epsilon(9.9999, td_quantile(histogram, 0.99999), 0.01);
-    mu_assert_double_eq_epsilon(10.0, td_quantile(histogram, 1.0), 0.001);
+    mu_assert_double_eq_epsilon(10.0, td_quantile(histogram, 1), 0.001);
 }
 
 MU_TEST(test_quantiles_multiple) {
@@ -643,6 +596,7 @@ MU_TEST_SUITE(test_suite) {
     MU_RUN_TEST(test_td_min);
     MU_RUN_TEST(test_quantiles);
    MU_RUN_TEST(test_quantiles_multiple);
+    MU_RUN_TEST(test_quantile_interpolations);
    MU_RUN_TEST(test_trimmed_mean_simple);
    MU_RUN_TEST(test_trimmed_mean_complex);
    MU_RUN_TEST(test_overflow);
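
Usage sketch (illustrative, not part of the patch): a minimal caller against the
post-patch API, showing that weights are now integral and that td_add reports a
would-be long long overflow as EDOM before any weight is accumulated. The
compression value 100 and the sample values are arbitrary choices for the example.

    /* example.c -- illustrative only; assumes the post-patch tdigest.h */
    #include <errno.h>
    #include <stdio.h>
    #include "tdigest.h"

    int main(void) {
        td_histogram_t *h = NULL;
        if (td_init(100, &h) != 0) /* non-zero return means allocation failed */
            return 1;
        td_add(h, 42.0, 1);     /* unit weight, as in the unit tests */
        td_add(h, 100.0, 1000); /* weight is a long long now, not a double */
        /* pushing the total weight past __LONG_LONG_MAX__ trips the new guard */
        if (td_add(h, 7.0, __LONG_LONG_MAX__) == EDOM)
            printf("weight overflow rejected\n");
        printf("size=%lld p50=%f\n", td_size(h), td_quantile(h, 0.5));
        td_free(h);
        return 0;
    }

Note that td_size now returns long long rather than double, so callers that
printed or compared it as a floating-point value need the %lld format (or an
explicit cast), matching the mu_assert_long_eq switch in the tests above.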