-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Improve overlap percent estimation for low-density ranges in StatisticRange #27570
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,6 +32,7 @@ public class StatisticRange | |
| { | ||
| private static final double INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.25; | ||
| private static final double INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.5; | ||
| private static final double DENSITY_HEURISTIC_THRESHOLD = 1e-3; | ||
|
|
||
| // TODO unify field and method names with SymbolStatsEstimate | ||
| /** | ||
|
|
@@ -122,7 +123,17 @@ public double overlapPercentWith(StatisticRange other) | |
| if (isInfinite(length()) && isFinite(lengthOfIntersect)) { | ||
| return INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; | ||
| } | ||
|
|
||
| if (lengthOfIntersect > 0) { | ||
| double thisDensity = this.distinctValues / length(); | ||
| double otherDensity = other.distinctValues / other.length(); | ||
| double minDensity = minExcludeNaN(thisDensity, otherDensity); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
|
|
||
| if (!isNaN(thisDensity) && !isNaN(otherDensity) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| && isFinite(length()) && isFinite(other.length()) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do we check that the lengths are finite? |
||
| && minDensity < DENSITY_HEURISTIC_THRESHOLD) { | ||
| return minExcludeNaN(this.distinctValues, other.distinctValues) / this.distinctValues; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can this be
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that we cannot use: Instead, we should use: Also, I looked more carefully at the idea of removing the
Comment on lines
+132
to
+135
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a comment explaining why this particular logic is here. |
||
| } | ||
| return lengthOfIntersect / length(); | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,7 @@ | |
| import static java.lang.Double.NaN; | ||
| import static java.lang.Double.POSITIVE_INFINITY; | ||
| import static org.assertj.core.api.Assertions.assertThat; | ||
| import static org.assertj.core.api.AssertionsForClassTypes.within; | ||
|
|
||
| public class TestStatisticRange | ||
| { | ||
|
|
@@ -59,6 +60,55 @@ public void testOverlapPercentWith() | |
| assertOverlap(unboundedRange(0.0), unboundedRange(0), 0); | ||
| } | ||
|
|
||
| @Test | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I love unit tests... except they don't really tell the story. When it's not obvious why we expect a given result, it's not clear what to do when a test fails on a change. Oftentimes, updating the test is all that's needed (formula changed ⇒ result changed ⇒ test expected value changed). |
||
| public void testLowDensityOverlap() | ||
| { | ||
| StatisticRange sparseRange = range(1, 3662098119.0, 14); | ||
| StatisticRange filterRange = range(1, 4, 4); | ||
|
|
||
| double expectedOverlap = 4.0 / 14.0; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. inline? |
||
| assertOverlap(sparseRange, filterRange, expectedOverlap); | ||
| } | ||
|
|
||
| @Test | ||
| public void testDensityThresholdBoundary() | ||
| { | ||
| StatisticRange boundaryRange = range(0, 10000, 10); | ||
| StatisticRange smallFilter = range(0, 100, 5); | ||
|
|
||
| double overlap = boundaryRange.overlapPercentWith(smallFilter); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd inline this variable and add line break before |
||
| assertThat(overlap).isBetween(0.01, 0.5); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that's very wide |
||
| } | ||
|
|
||
| @Test | ||
| public void testHighDensityOverlap() | ||
| { | ||
| StatisticRange denseRange = range(0, 100, 50); | ||
| StatisticRange filterRange = range(20, 30, 5); | ||
|
|
||
| assertOverlap(denseRange, filterRange, 0.1); | ||
| } | ||
|
|
||
| @Test | ||
| public void testVeryLowDensity() | ||
| { | ||
| StatisticRange verySparse = range(0, 1e9, 10); | ||
| StatisticRange filterRange = range(100, 200, 5); | ||
|
|
||
| double expected = 5.0 / 10.0; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. inline? |
||
| double actual = verySparse.overlapPercentWith(filterRange); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd inline this variable and add line break before |
||
| assertThat(actual).isCloseTo(expected, within(0.1)); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 0.1 is a large error margin given that the expected value is 0.5. |
||
| } | ||
|
|
||
| @Test | ||
| public void testDensityWithZeroDistinctValues() | ||
| { | ||
| StatisticRange zeroDistinct = range(0, 1000, 0); | ||
| StatisticRange filterRange = range(100, 200, 5); | ||
|
|
||
| assertOverlap(zeroDistinct, filterRange, 0); | ||
| } | ||
|
|
||
| @Test | ||
| public void testIntersect() | ||
| { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add a code comment explaining this section