Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug40168+34088 035 v2 #2123

Open
wants to merge 15 commits into
base: maint-0.3.5
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changes/bug34088
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
o Minor bugfixes (circuit build timeout):
- Remove max_time calculation and associated warn from circuit build
timeout 'alpha' parameter estimation, as this is no longer needed
by our new estimator from 40168. Fixes bug 34088; bugfix on 0.2.2.9-alpha.
16 changes: 16 additions & 0 deletions changes/bug40168
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
o Minor bugfixes (circuit build timeout):
- Improve the accuracy of our circuit build timeout calculation for 60%,
70%, and 80% build rates for various guard choices. We now use a maximum
likelihood estimator for Pareto parameters of the circuit build time
distribution, instead of a "right-censored estimator". This causes
clients to ignore circuits that never finish building in their timeout
calculations. Previously, clients were counting such unfinished circuits
as having the highest possible build time value, when in reality these
circuits most likely just contain relays that are offline. We also now
wait a bit longer to let circuits complete for measurement purposes,
lower the minimum possible effective timeout from 1.5 seconds to 10ms,
and increase the resolution of the circuit build time histogram from
50ms bin widths to 10ms bin widths. Additionally, we alter our estimate
Xm by taking the maximum of the top 10 most common build time values
of the 10ms histogram, and compute Xm as the average of these.
Fixes bug 40168; bugfix on 0.2.2.14-alpha.
168 changes: 53 additions & 115 deletions src/core/or/circuitstats.c
Original file line number Diff line number Diff line change
Expand Up @@ -205,10 +205,10 @@ circuit_build_times_max_timeouts(void)
* Retrieve and bounds-check the cbtnummodes consensus parameter.
*
* Effect: This value governs how many modes to use in the weighted
* average calculation of Pareto parameter Xm. A value of 3 introduces
* some bias (2-5% of CDF) under ideal conditions, but allows for better
* performance in the event that a client chooses guard nodes of radically
* different performance characteristics.
* average calculation of Pareto parameter Xm. Analysis of pairs of
* geographically near, far, and mixed guaeds has shown that a value of
* 10 introduces some allows for the actual timeout rate to be within
* 2-7% of the cutoff quantile, for quantiles between 60-80%.
*/
static int32_t
circuit_build_times_default_num_xm_modes(void)
Expand Down Expand Up @@ -853,74 +853,63 @@ circuit_build_times_create_histogram(const circuit_build_times_t *cbt,
* Return the Pareto start-of-curve parameter Xm.
*
* Because we are not a true Pareto curve, we compute this as the
* weighted average of the N most frequent build time bins. N is either
* 1 if we don't have enough circuit build time data collected, or
* determined by the consensus parameter cbtnummodes (default 3).
* weighted average of the 10 most frequent build time bins. This
* heuristic allowed for the actual timeout rate to be closest
* to the chosen quantile cutoff, for quantiles 60-80%, out of
* many variant approaches (see #40157 for analysis).
*/
static build_time_t
STATIC build_time_t
circuit_build_times_get_xm(circuit_build_times_t *cbt)
{
build_time_t i, nbins;
build_time_t nbins = 0;
build_time_t *nth_max_bin;
int32_t bin_counts=0;
build_time_t ret = 0;
uint32_t *histogram = circuit_build_times_create_histogram(cbt, &nbins);
int n=0;
build_time_t xm_total = 0;
build_time_t Xm = 0;
int32_t xm_counts=0;
int num_modes = circuit_build_times_default_num_xm_modes();
uint32_t *histogram = circuit_build_times_create_histogram(cbt, &nbins);

tor_assert(nbins > 0);
tor_assert(num_modes > 0);

// Only use one mode if < 1000 buildtimes. Not enough data
// for multiple.
if (cbt->total_build_times < CBT_NCIRCUITS_TO_OBSERVE)
num_modes = 1;

nth_max_bin = tor_calloc(num_modes, sizeof(build_time_t));

/* Determine the N most common build times */
for (i = 0; i < nbins; i++) {
if (histogram[i] >= histogram[nth_max_bin[0]]) {
nth_max_bin[0] = i;
}

for (n = 1; n < num_modes; n++) {
if (histogram[i] >= histogram[nth_max_bin[n]] &&
(!histogram[nth_max_bin[n-1]]
|| histogram[i] < histogram[nth_max_bin[n-1]])) {
/* Determine the N most common build times, by selecting the
* nth largest mode, counting it, and removing it from the histogram. */
for (int n = 0; n < num_modes; n++) {
/* Get nth mode */
for (build_time_t i = 0; i < nbins; i++) {
if (histogram[i] > histogram[nth_max_bin[n]]) {
nth_max_bin[n] = i;
}
}
}

for (n = 0; n < num_modes; n++) {
bin_counts += histogram[nth_max_bin[n]];
ret += CBT_BIN_TO_MS(nth_max_bin[n])*histogram[nth_max_bin[n]];
log_info(LD_CIRC, "Xm mode #%d: %u %u", n, CBT_BIN_TO_MS(nth_max_bin[n]),
histogram[nth_max_bin[n]]);
/* Update average */
xm_counts += histogram[nth_max_bin[n]];
xm_total += CBT_BIN_TO_MS(nth_max_bin[n])*histogram[nth_max_bin[n]];

/* Prevent from re-counting this value */
histogram[nth_max_bin[n]] = 0;
}

/* bin_counts can become zero if all of our last CBT_NCIRCUITS_TO_OBSERVE
/* xm_counts can become zero if all of our last CBT_NCIRCUITS_TO_OBSERVE
* circuits were abandoned before they completed. This shouldn't happen,
* though. We should have reset/re-learned a lower timeout first. */
if (bin_counts == 0) {
ret = 0;
if (xm_counts == 0) {
log_warn(LD_CIRC,
"No valid circuit build time data out of %d times, %u modes, "
"have_timeout=%d, %lfms", cbt->total_build_times, num_modes,
cbt->have_computed_timeout, cbt->timeout_ms);
goto done;
}

tor_assert(bin_counts > 0);

ret /= bin_counts;
Xm = xm_total / xm_counts;

done:
tor_free(histogram);
tor_free(nth_max_bin);

return ret;
return Xm;
}

/**
Expand Down Expand Up @@ -1009,43 +998,6 @@ circuit_build_times_shuffle_and_store_array(circuit_build_times_t *cbt,
}
}

/**
* Filter old synthetic timeouts that were created before the
* new right-censored Pareto calculation was deployed.
*
* Once all clients before 0.2.1.13-alpha are gone, this code
* will be unused.
*/
static int
circuit_build_times_filter_timeouts(circuit_build_times_t *cbt)
{
int num_filtered=0, i=0;
double timeout_rate = 0;
build_time_t max_timeout = 0;

timeout_rate = circuit_build_times_timeout_rate(cbt);
max_timeout = (build_time_t)cbt->close_ms;

for (i = 0; i < CBT_NCIRCUITS_TO_OBSERVE; i++) {
if (cbt->circuit_build_times[i] > max_timeout) {
build_time_t replaced = cbt->circuit_build_times[i];
num_filtered++;
cbt->circuit_build_times[i] = CBT_BUILD_ABANDONED;

log_debug(LD_CIRC, "Replaced timeout %d with %d", replaced,
cbt->circuit_build_times[i]);
}
}

log_info(LD_CIRC,
"We had %d timeouts out of %d build times, "
"and filtered %d above the max of %u",
(int)(cbt->total_build_times*timeout_rate),
cbt->total_build_times, num_filtered, max_timeout);

return num_filtered;
}

/**
* Load histogram from <b>state</b>, shuffling the resulting array
* after we do so. Use this result to estimate parameters and
Expand Down Expand Up @@ -1171,10 +1123,6 @@ circuit_build_times_parse_state(circuit_build_times_t *cbt,

circuit_build_times_set_timeout(cbt);

if (!state->CircuitBuildAbandonedCount && cbt->total_build_times) {
circuit_build_times_filter_timeouts(cbt);
}

done:
tor_free(loaded_times);
return err ? -1 : 0;
Expand All @@ -1195,7 +1143,6 @@ circuit_build_times_update_alpha(circuit_build_times_t *cbt)
build_time_t *x=cbt->circuit_build_times;
double a = 0;
int n=0,i=0,abandoned_count=0;
build_time_t max_time=0;

/* http://en.wikipedia.org/wiki/Pareto_distribution#Parameter_estimation */
/* We sort of cheat here and make our samples slightly more pareto-like
Expand All @@ -1215,14 +1162,13 @@ circuit_build_times_update_alpha(circuit_build_times_t *cbt)

if (x[i] < cbt->Xm) {
a += tor_mathlog(cbt->Xm);
n++;
} else if (x[i] == CBT_BUILD_ABANDONED) {
abandoned_count++;
} else {
a += tor_mathlog(x[i]);
if (x[i] > max_time)
max_time = x[i];
n++;
}
n++;
}

/*
Expand All @@ -1231,30 +1177,23 @@ circuit_build_times_update_alpha(circuit_build_times_t *cbt)
* performs this same check, and resets state if it hits it. If we
* hit it at runtime, something serious has gone wrong.
*/
if (n!=cbt->total_build_times) {
if (n!=cbt->total_build_times-abandoned_count) {
log_err(LD_CIRC, "Discrepancy in build times count: %d vs %d", n,
cbt->total_build_times);
}
tor_assert(n==cbt->total_build_times);

if (max_time <= 0) {
/* This can happen if Xm is actually the *maximum* value in the set.
* It can also happen if we've abandoned every single circuit somehow.
* In either case, tell the caller not to compute a new build timeout. */
log_warn(LD_BUG,
"Could not determine largest build time (%d). "
"Xm is %dms and we've abandoned %d out of %d circuits.", max_time,
cbt->Xm, abandoned_count, n);
return 0;
}

a += abandoned_count*tor_mathlog(max_time);
tor_assert_nonfatal(n==cbt->total_build_times-abandoned_count);

/* This is the "Maximum Likelihood Estimator" for parameter alpha of a Pareto
* Distribution. See:
* https://en.wikipedia.org/wiki/Pareto_distribution#Estimation_of_parameters
*
* The division in the estimator is done with subtraction outside the ln(),
* with the sum occurring in the for loop above.
*
* This done is to avoid the precision issues of logs of small values.
*/
a -= n*tor_mathlog(cbt->Xm);
// Estimator comes from Eq #4 in:
// "Bayesian estimation based on trimmed samples from Pareto populations"
// by Arturo J. Fernández. We are right-censored only.
a = (n-abandoned_count)/a;
a = n/a;

cbt->alpha = a;

Expand Down Expand Up @@ -1662,9 +1601,8 @@ circuit_build_times_network_check_changed(circuit_build_times_t *cbt)

log_notice(LD_CIRC,
"Your network connection speed appears to have changed. Resetting "
"timeout to %lds after %d timeouts and %d buildtimes.",
tor_lround(cbt->timeout_ms/1000), timeout_count,
total_build_times);
"timeout to %ldms after %d timeouts and %d buildtimes.",
tor_lround(cbt->timeout_ms), timeout_count, total_build_times);

return 1;
}
Expand Down Expand Up @@ -1830,7 +1768,7 @@ circuit_build_times_set_timeout(circuit_build_times_t *cbt)
return;

if (cbt->timeout_ms < circuit_build_times_min_timeout()) {
log_info(LD_CIRC, "Set buildtimeout to low value %fms. Setting to %dms",
log_notice(LD_CIRC, "Set buildtimeout to low value %fms. Setting to %dms",
cbt->timeout_ms, circuit_build_times_min_timeout());
cbt->timeout_ms = circuit_build_times_min_timeout();
if (cbt->close_ms < cbt->timeout_ms) {
Expand All @@ -1848,9 +1786,9 @@ circuit_build_times_set_timeout(circuit_build_times_t *cbt)
log_info(LD_CIRC,
"Based on %d circuit times, it looks like we don't need to "
"wait so long for circuits to finish. We will now assume a "
"circuit is too slow to use after waiting %ld seconds.",
"circuit is too slow to use after waiting %ld milliseconds.",
cbt->total_build_times,
tor_lround(cbt->timeout_ms/1000));
tor_lround(cbt->timeout_ms));
log_info(LD_CIRC,
"Circuit timeout data: %fms, %fms, Xm: %d, a: %f, r: %f",
cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha,
Expand All @@ -1859,18 +1797,18 @@ circuit_build_times_set_timeout(circuit_build_times_t *cbt)
log_info(LD_CIRC,
"Based on %d circuit times, it looks like we need to wait "
"longer for circuits to finish. We will now assume a "
"circuit is too slow to use after waiting %ld seconds.",
"circuit is too slow to use after waiting %ld milliseconds.",
cbt->total_build_times,
tor_lround(cbt->timeout_ms/1000));
tor_lround(cbt->timeout_ms));
log_info(LD_CIRC,
"Circuit timeout data: %fms, %fms, Xm: %d, a: %f, r: %f",
cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha,
timeout_rate);
} else {
log_info(LD_CIRC,
"Set circuit build timeout to %lds (%fms, %fms, Xm: %d, a: %f,"
"Set circuit build timeout to %ldms (%fms, %fms, Xm: %d, a: %f,"
" r: %f) based on %d circuit times",
tor_lround(cbt->timeout_ms/1000),
tor_lround(cbt->timeout_ms),
cbt->timeout_ms, cbt->close_ms, cbt->Xm, cbt->alpha, timeout_rate,
cbt->total_build_times);
}
Expand Down
11 changes: 6 additions & 5 deletions src/core/or/circuitstats.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ void circuit_build_times_mark_circ_as_measurement_only(origin_circuit_t *circ);
#define CBT_NCIRCUITS_TO_OBSERVE 1000

/** Width of the histogram bins in milliseconds */
#define CBT_BIN_WIDTH ((build_time_t)50)
#define CBT_BIN_WIDTH ((build_time_t)10)

/** Number of modes to use in the weighted-avg computation of Xm */
#define CBT_DEFAULT_NUM_XM_MODES 3
#define CBT_DEFAULT_NUM_XM_MODES 10
#define CBT_MIN_NUM_XM_MODES 1
#define CBT_MAX_NUM_XM_MODES 20

Expand All @@ -78,7 +78,7 @@ void circuit_build_times_mark_circ_as_measurement_only(origin_circuit_t *circ);
* How long to wait before actually closing circuits that take too long to
* build in terms of CDF quantile.
*/
#define CBT_DEFAULT_CLOSE_QUANTILE 95
#define CBT_DEFAULT_CLOSE_QUANTILE 99
#define CBT_MIN_CLOSE_QUANTILE CBT_MIN_QUANTILE_CUTOFF
#define CBT_MAX_CLOSE_QUANTILE CBT_MAX_QUANTILE_CUTOFF

Expand Down Expand Up @@ -119,8 +119,8 @@ double circuit_build_times_quantile_cutoff(void);
#define CBT_MAX_TEST_FREQUENCY INT32_MAX

/** Lowest allowable value for CircuitBuildTimeout in milliseconds */
#define CBT_DEFAULT_TIMEOUT_MIN_VALUE (1500)
#define CBT_MIN_TIMEOUT_MIN_VALUE 500
#define CBT_DEFAULT_TIMEOUT_MIN_VALUE (CBT_BIN_WIDTH)
#define CBT_MIN_TIMEOUT_MIN_VALUE CBT_BIN_WIDTH
#define CBT_MAX_TIMEOUT_MIN_VALUE INT32_MAX

/** Initial circuit build timeout in milliseconds */
Expand All @@ -142,6 +142,7 @@ STATIC void circuit_build_times_reset(circuit_build_times_t *cbt);
/* Network liveness functions */
STATIC int circuit_build_times_network_check_changed(
circuit_build_times_t *cbt);
STATIC build_time_t circuit_build_times_get_xm(circuit_build_times_t *cbt);
#endif /* defined(CIRCUITSTATS_PRIVATE) */

#ifdef TOR_UNIT_TESTS
Expand Down
Loading