Skip to content

Commit

Permalink
Merge pull request #57 from dynatrace-oss/martingale-estimation
Browse files Browse the repository at this point in the history
Martingale estimation
  • Loading branch information
oertl authored Nov 27, 2022
2 parents 6639b99 + 29e1088 commit 24ac6a4
Show file tree
Hide file tree
Showing 14 changed files with 806 additions and 283 deletions.
111 changes: 111 additions & 0 deletions src/main/java/com/dynatrace/hash4j/distinctcount/DistinctCounter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/*
* Copyright 2022 Dynatrace LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dynatrace.hash4j.distinctcount;

// package-private interface to unify method names and to simplify testing
interface DistinctCounter<T extends DistinctCounter<T>> {

/**
* Adds a new element represented by a 64-bit hash value to this sketch.
*
* <p>In order to get good estimates, it is important that the hash value is calculated using a
* high-quality hash algorithm.
*
* @param hashValue a 64-bit hash value
* @return this sketch
*/
T add(long hashValue);

/**
* Adds another sketch.
*
* <p>The precision parameter of the added sketch must not be smaller than the precision parameter
* of this sketch. Otherwise, an {@link IllegalArgumentException} will be thrown.
*
* @param other the other sketch
* @return this sketch
* @throws NullPointerException if the argument is null
*/
T add(T other);

/**
* Returns an estimate of the number of distinct elements added to this sketch.
*
* @return estimated number of distinct elements
*/
double getDistinctCountEstimate();

/**
* Creates a copy of this sketch.
*
* @return the copy
*/
T copy();

/**
* Returns a downsized copy of this sketch with a precision that is not larger than the given
* precision parameter.
*
* @param p the precision parameter used for downsizing
* @return the downsized copy
* @throws IllegalArgumentException if the precision parameter is invalid
*/
T downsize(int p);

/**
* Resets this sketch to its initial state representing an empty set.
*
* @return this sketch
*/
T reset();

/**
* Returns a reference to the internal state of this sketch.
*
* <p>The returned state is never {@code null}.
*
* @return the internal state of this sketch
*/
byte[] getState();

/**
* Returns the precision parameter of this sketch.
*
* @return the precision parameter
*/
int getP();

/**
* Adds a new element represented by a 64-bit hash value to this sketch and passes, if the
* internal state has changed, decrements of the state change probability to the given {@link
* StateChangeObserver}.
*
* <p>In order to get good estimates, it is important that the hash value is calculated using a
* high-quality hash algorithm.
*
* @param hashValue a 64-bit hash value
* @param stateChangeObserver a state change observer
* @return this sketch
*/
T add(long hashValue, StateChangeObserver stateChangeObserver);

/**
* Returns the probability of an internal state change when a new distinct element is added.
*
* @return the state change probability
*/
double getStateChangeProbability();
}
84 changes: 59 additions & 25 deletions src/main/java/com/dynatrace/hash4j/distinctcount/HyperLogLog.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,10 @@
* 16th International Conference on Extending Database Technology. 2013.
* </ul>
*/
public final class HyperLogLog {
public final class HyperLogLog implements DistinctCounter<HyperLogLog> {

private static final PackedArrayHandler ARRAY_HANDLER = PackedArray.getHandler(6);

// visible for testing
static final double VARIANCE_FACTOR = 1.0794415416798357;

private static final double[] ESTIMATION_FACTORS = getEstimationFactors();

/**
Expand Down Expand Up @@ -182,22 +179,24 @@ private static int mul4DivideBy3(int x) {
}

/**
* Creates a copy of this {@link HyperLogLog} sketch.
* Creates a copy of this sketch.
*
* @return the copy
*/
@Override
public HyperLogLog copy() {
return new HyperLogLog(Arrays.copyOf(state, state.length), p);
}

/**
* Returns a downsized copy of this {@link HyperLogLog} sketch with a precision that is not larger
* than the given precision parameter.
* Returns a downsized copy of this sketch with a precision that is not larger than the given
* precision parameter.
*
* @param p the precision parameter used for downsizing
* @return the downsized copy
* @throws IllegalArgumentException if the precision parameter is invalid
*/
@Override
public HyperLogLog downsize(int p) {
checkPrecisionParameter(p, MIN_P, MAX_P);
if (p >= this.p) {
Expand Down Expand Up @@ -228,12 +227,13 @@ public static HyperLogLog merge(HyperLogLog sketch1, HyperLogLog sketch2) {
}

/**
* Returns a reference to the internal state of this {@link HyperLogLog} sketch.
* Returns a reference to the internal state of this sketch.
*
* <p>The returned state is never {@code null}.
*
* @return the internal state of this sketch
*/
@Override
public byte[] getState() {
return state;
}
Expand All @@ -243,6 +243,7 @@ public byte[] getState() {
*
* @return the precision parameter
*/
@Override
public int getP() {
return p;
}
Expand All @@ -261,15 +262,43 @@ static int calculateP(int stateLength) {
* @param hashValue a 64-bit hash value
* @return this sketch
*/
@Override
public HyperLogLog add(long hashValue) {
add(hashValue, null);
return this;
}

/**
* Adds a new element represented by a 64-bit hash value to this sketch and passes, if the
* internal state has changed, decrements of the state change probability to the given {@link
* StateChangeObserver}.
*
* <p>In order to get good estimates, it is important that the hash value is calculated using a
* high-quality hash algorithm.
*
* @param hashValue a 64-bit hash value
* @param stateChangeObserver a state change observer
* @return this sketch
*/
@Override
public HyperLogLog add(long hashValue, StateChangeObserver stateChangeObserver) {
int idx = (int) (hashValue >>> (-p));
long nlz = Long.numberOfLeadingZeros(~(~hashValue << p)); // nlz in {0, 1, ..., 64-p}
ARRAY_HANDLER.update(state, idx, nlz + 1, Math::max);
long newValue = Long.numberOfLeadingZeros(~(~hashValue << p)) + 1;
long oldValue = ARRAY_HANDLER.update(state, idx, newValue, Math::max);
if (stateChangeObserver != null && newValue > oldValue) {
double stateChangeProbabilityDecrement =
getRegisterChangeProbability(oldValue, p) - getRegisterChangeProbability(newValue, p);
stateChangeObserver.stateChanged(stateChangeProbabilityDecrement);
}
return this;
}

private static double getRegisterChangeProbability(long registerValue, int p) {
return Double.longBitsToDouble(0x3FF0000000000000L - ((registerValue + p) << 52));
}

/**
* Adds another {@link HyperLogLog} sketch.
* Adds another sketch.
*
* <p>The precision parameter of the added sketch must not be smaller than the precision parameter
* of this sketch. Otherwise, an {@link IllegalArgumentException} will be thrown.
Expand All @@ -278,6 +307,7 @@ public HyperLogLog add(long hashValue) {
* @return this sketch
* @throws NullPointerException if the argument is null
*/
@Override
public HyperLogLog add(HyperLogLog other) {
requireNonNull(other, "null argument");
byte[] otherData = other.state;
Expand Down Expand Up @@ -316,6 +346,7 @@ public HyperLogLog add(HyperLogLog other) {
*
* @return estimated number of distinct elements
*/
@Override
public double getDistinctCountEstimate() {
int c0 = 0;
double sum = 0;
Expand Down Expand Up @@ -357,29 +388,32 @@ static double sigma(double x) {
}

/**
* Visible for testing.
*
* <p>Returns the theoretical asymptotic (for large p and as the distinct count goes to infinity)
* relative standard error of the distinct count estimate for a given precision parameter.
*
* <p>For small cardinalities (up to the order of {@code 2^p} where {@code p} is the precision
* parameter, the relative error is usually less than this theoretical error.
* Returns the probability of an internal state change when a new distinct element is added.
*
* <p>The empirical root-mean square error might be slightly greater than this theoretical error,
* especially for small precision parameters.
*
* @param p the precision parameter
* @return the relative standard error
* @return the state change probability
*/
static double calculateTheoreticalRelativeStandardError(int p) {
return Math.sqrt(VARIANCE_FACTOR / (1 << p));
@Override
public double getStateChangeProbability() {
double sum = 0;
for (int off = 0; off + 2 < state.length; off += 3) {
long r0 = state[off] & 0x3fL;
long r1 = ((state[off] & 0xc0L) >>> 6) | ((state[off + 1] & 0x0fL) << 2);
long r2 = ((state[off + 1] & 0xf0L) >>> 4) | ((state[off + 2] & 0x03L) << 4);
long r3 = (state[off + 2] & 0xfcL) >>> 2;
sum += getRegisterChangeProbability(r0, p);
sum += getRegisterChangeProbability(r1, p);
sum += getRegisterChangeProbability(r2, p);
sum += getRegisterChangeProbability(r3, p);
}
return sum;
}

/**
* Resets this sketch to its initial state representing an empty set.
*
* @return this sketch
*/
@Override
public HyperLogLog reset() {
ARRAY_HANDLER.clear(state);
return this;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/*
* Copyright 2022 Dynatrace LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.dynatrace.hash4j.distinctcount;

import static com.dynatrace.hash4j.util.Preconditions.checkArgument;

/**
* A martingale estimator, that can be used in conjunction with a distinct counter such as {@link
* HyperLogLog} or {@link UltraLogLog} to obtain slightly more accurate estimates for
* non-distributed data streams than the corresponding standard estimators. For distributed data
* streams, which require merging of partial results into a final result, the use of this martingale
* estimator is not very useful and therefore not recommended.
*
* <p>In order to get correct estimates, this estimator must be updated with every single add
* operation of the corresponding distinct count data structure using {@link HyperLogLog#add(long,
* StateChangeObserver)} and {@link UltraLogLog#add(long, StateChangeObserver)}, respectively.
*
* <p>The estimator will become invalid, if the associated data structure is modified using
* non-element addition operations such as {@link HyperLogLog#add(HyperLogLog)} or {@link
* UltraLogLog#add(UltraLogLog)}. It is possible to initiate a new martingale estimator using the
* values returned by {@code getStateChangeProbability()} and {@code getDistinctCountEstimate()} of
* the corresponding distinct counter. At that point, the martingale estimator returns the same
* estimate as the standard estimator. However, if many further elements are added, the martingale
* estimator may again produce better estimates.
*
* <p>References:
*
* <ul>
* <li>Ting, Daniel. "Streamed approximate counting of distinct elements: Beating optimal batch
* methods." Proceedings of the 20th ACM SIGKDD international conference on Knowledge
* discovery and data mining. 2014.
* <li>Cohen, Edith. "All-distances sketches, revisited: HIP estimators for massive graphs
* analysis." Proceedings of the 33rd ACM SIGMOD-SIGACT-SIGART symposium on Principles of
* database systems. 2014.
* <li>Pettie, Seth, Dingyu Wang, and Longhui Yin. "Non-mergeable sketching for cardinality
* estimation." arXiv preprint arXiv:2008.08739 (2020).
* </ul>
*/
public final class MartingaleEstimator implements StateChangeObserver {

private double distinctCountEstimate;
private double stateChangeProbability;

public MartingaleEstimator(double distinctCountEstimate, double stateChangeProbability) {
checkArgument(
distinctCountEstimate >= 0, "Initial distinct count estimate must be non-negative!");
checkArgument(
stateChangeProbability >= 0 && stateChangeProbability <= 1,
"Initial state change probability must be in the range [0,1]!");
this.distinctCountEstimate = distinctCountEstimate;
if (stateChangeProbability
<= 0) { // if state change probability == -0.0 set it to +0.0, to avoid negative infinite
// estimates
stateChangeProbability = 0.;
}
this.stateChangeProbability = stateChangeProbability;
}

public MartingaleEstimator() {
this.distinctCountEstimate = 0;
this.stateChangeProbability = 1;
}

/**
* Returns the distinct count estimate.
*
* @return the distinct count estimate
*/
public double getDistinctCountEstimate() {
return distinctCountEstimate;
}

// visible for testing
double getStateChangeProbability() {
return stateChangeProbability;
}

@Override
public void stateChanged(double probabilityDecrement) {
distinctCountEstimate += 1. / stateChangeProbability;
stateChangeProbability -= probabilityDecrement;
if (stateChangeProbability <= 0) { // numerical errors could lead to negative probability
stateChangeProbability =
0; // set to zero in this case => next state change will set estimate = infinite
}
}

@Override
public String toString() {
return MartingaleEstimator.class.getSimpleName()
+ "{"
+ "distinctCountEstimate="
+ distinctCountEstimate
+ ", stateChangeProbability="
+ stateChangeProbability
+ '}';
}
}
Loading

0 comments on commit 24ac6a4

Please sign in to comment.