From 0ef8f41a3756b7a100eb8449051fd589ca27c22a Mon Sep 17 00:00:00 2001 From: Murphy Date: Thu, 17 Oct 2024 18:53:09 +0800 Subject: [PATCH 1/9] tmp Signed-off-by: Murphy --- .../com/starrocks/qe/SessionVariable.java | 8 +++ .../sql/optimizer/statistics/Bucket.java | 4 ++ .../sql/optimizer/statistics/Histogram.java | 4 ++ .../statistics/HistogramEstimator.java | 57 +++++++++++++++++++ .../statistics/StatisticRangeValues.java | 13 +++++ 5 files changed, 86 insertions(+) create mode 100644 fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java diff --git a/fe/fe-core/src/main/java/com/starrocks/qe/SessionVariable.java b/fe/fe-core/src/main/java/com/starrocks/qe/SessionVariable.java index 7e447f85fbd3c..e9fdf79798fde 100644 --- a/fe/fe-core/src/main/java/com/starrocks/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/com/starrocks/qe/SessionVariable.java @@ -356,6 +356,7 @@ public class SessionVariable implements Serializable, Writable, Cloneable { public static final String CBO_MAX_REORDER_NODE = "cbo_max_reorder_node"; public static final String CBO_PRUNE_SHUFFLE_COLUMN_RATE = "cbo_prune_shuffle_column_rate"; public static final String CBO_PUSH_DOWN_AGGREGATE_MODE = "cbo_push_down_aggregate_mode"; + public static final String CBO_ENABLE_HISTOGRAM_JOIN_ESTIMATION = "cbo_enable_histogram_join_estimation"; public static final String CBO_PUSH_DOWN_DISTINCT_BELOW_WINDOW = "cbo_push_down_distinct_below_window"; public static final String CBO_PUSH_DOWN_AGGREGATE = "cbo_push_down_aggregate"; @@ -1490,6 +1491,9 @@ public static MaterializedViewRewriteMode parse(String str) { @VarAttr(name = CBO_PUSH_DOWN_GROUPINGSET_RESHUFFLE, flag = VariableMgr.INVISIBLE) private boolean cboPushDownGroupingSetReshuffle = true; + @VarAttr(name = CBO_ENABLE_HISTOGRAM_JOIN_ESTIMATION, flag = VariableMgr.INVISIBLE) + private boolean cboEnableHistogramJoinEstimation = true; + @VariableMgr.VarAttr(name = PARSE_TOKENS_LIMIT) private int parseTokensLimit = 3500000; @@ -3479,6 +3483,10 @@ public void setCboPushDownDistinctBelowWindow(boolean flag) { this.cboPushDownDistinctBelowWindow = flag; } + public boolean isCboEnableHistogramJoinEstimation() { + return cboEnableHistogramJoinEstimation; + } + public boolean isCboPushDownDistinctBelowWindow() { return this.cboPushDownDistinctBelowWindow; } diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java index 80ad8f28a96e1..7102889ada695 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java @@ -43,4 +43,8 @@ public Long getCount() { public Long getUpperRepeats() { return upperRepeats; } + + public boolean isOverlapped(Bucket other) { + return Math.min(upper, other.upper) - Math.max(lower, other.lower) > 0; + } } \ No newline at end of file diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java index 76596a8c9febc..57ca06359e1f3 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java @@ -63,6 +63,10 @@ public String getMcvString() { return sb.toString(); } + public List getOverlapped(Bucket bucket) { + + } + public Optional getRowCountInBucket(ConstantOperator constantOperator, double distinctValuesCount) { Optional valueOpt = StatisticUtils.convertStatisticsToDouble(constantOperator.getType(), constantOperator.toString()); diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java new file mode 100644 index 0000000000000..4b9ce1194b658 --- /dev/null +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java @@ -0,0 +1,57 @@ +// Copyright 2021-present StarRocks, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.starrocks.sql.optimizer.statistics; + +import com.starrocks.qe.ConnectContext; + +import java.util.Collections; +import java.util.List; + +/** + * Use histogram to estimate cardinality + */ +public class HistogramEstimator { + + /** + * Return null if failed to estimate + */ + public static Double estimateEqualToSelectivity(ColumnStatistic left, ColumnStatistic right) { + ConnectContext context = ConnectContext.get(); + if (context != null && !context.getSessionVariable().isCboEnableHistogramJoinEstimation()) { + return null; + } + if (left.getHistogram() == null || right.getHistogram() == null) { + return null; + } + + Histogram lhs = left.getHistogram(); + Histogram rhs = right.getHistogram(); + for (Bucket bucket : lhs.getBuckets()) { + Collections.binarySearch(rhs.getBuckets(), new Bucket(1, 1, 1, 1)); + List overlapped = rhs.getOverlapped(bucket); + long overlapCount = 0; + for (Bucket overlap : overlapped) { + StatisticRangeValues leftRange = + new StatisticRangeValues(bucket.getLower(), bucket.getUpper(), bucket.getUpperRepeats()); + StatisticRangeValues rightRange = + new StatisticRangeValues(overlap.getLower(), overlap.getUpper(), bucket.getUpperRepeats()); + double overlapLength = leftRange.overlapLength(rightRange); + overlapCount += overlapLength; + } + return 1.0 * overlapCount / lhs.getTotalRows(); + } + } + +} diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticRangeValues.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticRangeValues.java index 18ca49e6eea97..9bf30cffa1cf7 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticRangeValues.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticRangeValues.java @@ -111,6 +111,19 @@ public double overlapPercentWith(@NotNull StatisticRangeValues other) { return StatisticsEstimateCoefficient.OVERLAP_INFINITE_RANGE_FILTER_COEFFICIENT; } + double overlapLength(StatisticRangeValues other) { + if (this.isEmpty() || other.isEmpty()) { + return 0.0; + } + // If the low and high values is infinite, it represents either string type or unknown of column statistics. + if (this.equals(other) && !isBothInfinite()) { + return 1.0; + } + + double lengthOfIntersect = min(this.high, other.high) - max(this.low, other.low); + return lengthOfIntersect; + } + public StatisticRangeValues intersect(StatisticRangeValues other) { double newLow = max(low, other.low); double newHigh = min(high, other.high); From d790d982dabf23b28c8a9a5775fc3a7350c5601d Mon Sep 17 00:00:00 2001 From: Murphy Date: Fri, 18 Oct 2024 10:49:43 +0800 Subject: [PATCH 2/9] histogram selectivity Signed-off-by: Murphy --- .../sql/optimizer/statistics/Histogram.java | 25 +++++- .../statistics/HistogramEstimator.java | 81 +++++++++++++------ .../statistics/HistogramEstimatorTest.java | 73 +++++++++++++++++ 3 files changed, 151 insertions(+), 28 deletions(-) create mode 100644 fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java index 57ca06359e1f3..4c448a884fefe 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java @@ -14,6 +14,8 @@ package com.starrocks.sql.optimizer.statistics; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; import com.starrocks.sql.optimizer.operator.scalar.ConstantOperator; import com.starrocks.statistic.StatisticUtils; @@ -63,10 +65,6 @@ public String getMcvString() { return sb.toString(); } - public List getOverlapped(Bucket bucket) { - - } - public Optional getRowCountInBucket(ConstantOperator constantOperator, double distinctValuesCount) { Optional valueOpt = StatisticUtils.convertStatisticsToDouble(constantOperator.getType(), constantOperator.toString()); @@ -109,4 +107,23 @@ public Optional getRowCountInBucket(ConstantOperator constantOperator, dou return Optional.empty(); } + + static class Builder { + private final List buckets = Lists.newArrayList(); + private final Map mcv = Maps.newHashMap(); + + public Builder addBucket(Bucket bucket) { + this.buckets.add(bucket); + return this; + } + + public Builder addCommonValue(String key, Long count) { + this.mcv.put(key, count); + return this; + } + + public Histogram build() { + return new Histogram(buckets, mcv); + } + } } diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java index 4b9ce1194b658..7686aca5555f3 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java @@ -14,44 +14,77 @@ package com.starrocks.sql.optimizer.statistics; -import com.starrocks.qe.ConnectContext; - -import java.util.Collections; -import java.util.List; - /** * Use histogram to estimate cardinality */ public class HistogramEstimator { - + /** - * Return null if failed to estimate + * Estimate the selectivity of two columns with EqualTo operator + * Return null if fail to do the estimation */ public static Double estimateEqualToSelectivity(ColumnStatistic left, ColumnStatistic right) { - ConnectContext context = ConnectContext.get(); - if (context != null && !context.getSessionVariable().isCboEnableHistogramJoinEstimation()) { + // Check if input parameters are valid + if (left == null || right == null) { return null; } - if (left.getHistogram() == null || right.getHistogram() == null) { + + // Get histograms + Histogram leftHistogram = left.getHistogram(); + Histogram rightHistogram = right.getHistogram(); + + // If either histogram is empty, estimation is not possible + if (leftHistogram == null || rightHistogram == null) { return null; } - Histogram lhs = left.getHistogram(); - Histogram rhs = right.getHistogram(); - for (Bucket bucket : lhs.getBuckets()) { - Collections.binarySearch(rhs.getBuckets(), new Bucket(1, 1, 1, 1)); - List overlapped = rhs.getOverlapped(bucket); - long overlapCount = 0; - for (Bucket overlap : overlapped) { - StatisticRangeValues leftRange = - new StatisticRangeValues(bucket.getLower(), bucket.getUpper(), bucket.getUpperRepeats()); - StatisticRangeValues rightRange = - new StatisticRangeValues(overlap.getLower(), overlap.getUpper(), bucket.getUpperRepeats()); - double overlapLength = leftRange.overlapLength(rightRange); - overlapCount += overlapLength; + // Calculate the overlapping area of the two histograms + double overlapArea = 0.0; + double totalArea = 0.0; + + for (Bucket leftBucket : leftHistogram.getBuckets()) { + for (Bucket rightBucket : rightHistogram.getBuckets()) { + double overlap = calculateBucketOverlap(leftBucket, rightBucket); + overlapArea += overlap; } - return 1.0 * overlapCount / lhs.getTotalRows(); + totalArea += leftBucket.getCount(); + } + + // Calculate selectivity + if (totalArea > 0) { + return overlapArea / totalArea; + } else { + return null; } } + private static double calculateBucketOverlap(Bucket leftBucket, Bucket rightBucket) { + double leftLower = leftBucket.getLower(); + double leftUpper = leftBucket.getUpper(); + double rightLower = rightBucket.getLower(); + double rightUpper = rightBucket.getUpper(); + + // Calculate overlap interval + double overlapLower = Math.max(leftLower, rightLower); + double overlapUpper = Math.min(leftUpper, rightUpper); + + // If there's no overlap, return 0 + if (overlapLower >= overlapUpper) { + return 0; + } + + // Calculate overlap ratio + double leftRange = leftUpper - leftLower; + double rightRange = rightUpper - rightLower; + double overlapRange = overlapUpper - overlapLower; + + double leftOverlapRatio = overlapRange / leftRange; + double rightOverlapRatio = overlapRange / rightRange; + + // Estimate the count of overlapping elements + double overlapCount = + Math.min(leftBucket.getCount() * leftOverlapRatio, rightBucket.getCount() * rightOverlapRatio); + + return overlapCount; + } } diff --git a/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java b/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java new file mode 100644 index 0000000000000..4c37a1368fd65 --- /dev/null +++ b/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java @@ -0,0 +1,73 @@ +// Copyright 2021-present StarRocks, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.starrocks.sql.optimizer.statistics; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.stream.Stream; + + +public class HistogramEstimatorTest { + + @ParameterizedTest + @MethodSource("provideTestCases") + public void testEstimateEqualToSelectivity( + ColumnStatistic left, ColumnStatistic right, Double expectedSelectivity) { + Double actualSelectivity = HistogramEstimator.estimateEqualToSelectivity(left, right); + if (expectedSelectivity == null) { + Assertions.assertNull(actualSelectivity); + } else { + Assertions.assertNotNull(actualSelectivity); + Assertions.assertEquals(expectedSelectivity, actualSelectivity, 0.01); + } + } + + private static Stream provideTestCases() { + return Stream.of( + // Normal case: overlapping histograms + Arguments.of(createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), + createColumnStatistic(new double[] {3, 7, 12}, new long[] {150, 250}), 0.5), + // Completely overlapping histograms + Arguments.of(createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), + createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), 1.0), + // Non-overlapping histograms + Arguments.of(createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), + createColumnStatistic(new double[] {15, 20, 25}, new long[] {150, 250}), 0.0), + // One empty histogram + Arguments.of(createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), + createColumnStatistic(), null), + // Both empty histograms + Arguments.of(createColumnStatistic(), createColumnStatistic(), null), + // One null histogram + Arguments.of(createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), null, null)); + } + + // create an empty column statistics + private static ColumnStatistic createColumnStatistic() { + return new ColumnStatistic(0, 0, 0, 0, 0, null, ColumnStatistic.StatisticType.ESTIMATE); + } + + private static ColumnStatistic createColumnStatistic(double[] bounds, long[] counts) { + Histogram.Builder builder = new Histogram.Builder(); + for (int i = 0; i < counts.length; i++) { + builder.addBucket(new Bucket(bounds[i], bounds[i + 1], counts[i], 0L)); + } + Histogram histogram = builder.build(); + return new ColumnStatistic(0, 0, 0, 0, 0, histogram, ColumnStatistic.StatisticType.ESTIMATE); + } +} From c4c934336bad83fc7982836b0f34471c2d1e4dd4 Mon Sep 17 00:00:00 2001 From: Murphy Date: Fri, 18 Oct 2024 13:10:18 +0800 Subject: [PATCH 3/9] add more test Signed-off-by: Murphy --- .../statistics/HistogramEstimator.java | 5 ++ .../statistics/HistogramEstimatorTest.java | 80 ++++++++++++++++--- 2 files changed, 75 insertions(+), 10 deletions(-) diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java index 7686aca5555f3..7690cdb4a863c 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java @@ -14,6 +14,8 @@ package com.starrocks.sql.optimizer.statistics; +import com.google.common.base.Preconditions; + /** * Use histogram to estimate cardinality */ @@ -52,6 +54,9 @@ public static Double estimateEqualToSelectivity(ColumnStatistic left, ColumnStat // Calculate selectivity if (totalArea > 0) { + double selectivity = overlapArea / totalArea; + Preconditions.checkState(0.0 <= selectivity && selectivity <= 1.0, + "exceptional selectivity: " + selectivity); return overlapArea / totalArea; } else { return null; diff --git a/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java b/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java index 4c37a1368fd65..a2181901458c6 100644 --- a/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java +++ b/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java @@ -21,9 +21,8 @@ import java.util.stream.Stream; - public class HistogramEstimatorTest { - + @ParameterizedTest @MethodSource("provideTestCases") public void testEstimateEqualToSelectivity( @@ -40,28 +39,89 @@ public void testEstimateEqualToSelectivity( private static Stream provideTestCases() { return Stream.of( // Normal case: overlapping histograms - Arguments.of(createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), - createColumnStatistic(new double[] {3, 7, 12}, new long[] {150, 250}), 0.5), + Arguments.of( + createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), + createColumnStatistic(new double[] {3, 7, 12}, new long[] {150, 250}), + 0.81), + Arguments.of( + createColumnStatistic(new double[] {1, 5, 10}, new long[] {1, 2}), + createColumnStatistic(new double[] {3, 7, 12}, new long[] {150, 250}), + 0.83), + Arguments.of( + createColumnStatistic(new double[] {3, 7, 12}, new long[] {150, 250}), + createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), + 0.61), + + // Normal case: diverse bucket + Arguments.of( + createColumnStatistic(new double[] {1, 100, 200, 300, 400}, new long[] {100, 200, 200, 400}), + createColumnStatistic(new double[] {1, 200, 400}, new long[] {150, 250}), + 0.44), + + // Normal case: lots of buckets, but the range is same + Arguments.of( + createColumnStatistic(createUniformedHistogram(100, 1024, 1 << 16)), + createColumnStatistic(createUniformedHistogram(100, 1024, 1 << 16)), + 1.0), + Arguments.of( + createColumnStatistic(createUniformedHistogram(100, 1024, 1 << 10)), + createColumnStatistic(createUniformedHistogram(100, 1024, 1 << 16)), + 1.0), + Arguments.of( + createColumnStatistic(createUniformedHistogram(100, 1024, 1 << 10)), + createColumnStatistic(createUniformedHistogram(800, 128, 1 << 16)), + 1.0), + Arguments.of( + createColumnStatistic(createUniformedHistogram(100, 1024, 1 << 10)), + createColumnStatistic(createUniformedHistogram(10, 10240, 1 << 16)), + 1.0), + // Completely overlapping histograms - Arguments.of(createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), - createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), 1.0), + Arguments.of( + createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), + createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), + 1.0), + Arguments.of( + createColumnStatistic(new double[] {1, 5, 10}, new long[] {10, 20}), + createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), + 1.0), + // Non-overlapping histograms - Arguments.of(createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), - createColumnStatistic(new double[] {15, 20, 25}, new long[] {150, 250}), 0.0), + Arguments.of( + createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), + createColumnStatistic(new double[] {15, 20, 25}, new long[] {150, 250}), + 0.0), + // One empty histogram - Arguments.of(createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), - createColumnStatistic(), null), + Arguments.of( + createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), + createColumnStatistic(), + null), // Both empty histograms Arguments.of(createColumnStatistic(), createColumnStatistic(), null), // One null histogram Arguments.of(createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), null, null)); } + private static Histogram createUniformedHistogram(int numBuckets, double bucketRange, long perBucketCount) { + Histogram.Builder builder = new Histogram.Builder(); + double lower = 0.0; + for (int i = 0; i < numBuckets; i++) { + builder.addBucket(new Bucket(lower, lower + bucketRange, perBucketCount, 1L)); + lower += bucketRange; + } + return builder.build(); + } + // create an empty column statistics private static ColumnStatistic createColumnStatistic() { return new ColumnStatistic(0, 0, 0, 0, 0, null, ColumnStatistic.StatisticType.ESTIMATE); } + private static ColumnStatistic createColumnStatistic(Histogram hist) { + return new ColumnStatistic(0, 0, 0, 0, 0, hist, ColumnStatistic.StatisticType.ESTIMATE); + } + private static ColumnStatistic createColumnStatistic(double[] bounds, long[] counts) { Histogram.Builder builder = new Histogram.Builder(); for (int i = 0; i < counts.length; i++) { From 6604eed9fe7a02b30005067a74342aed34b23e74 Mon Sep 17 00:00:00 2001 From: Murphy Date: Fri, 18 Oct 2024 16:00:26 +0800 Subject: [PATCH 4/9] calculate predicate selectivity Signed-off-by: Murphy --- .../BinaryPredicateStatisticCalculator.java | 37 ++++++++++++-- .../sql/optimizer/statistics/Bucket.java | 36 +++++++++++++- .../sql/optimizer/statistics/Histogram.java | 40 +++++++++++++-- .../statistics/HistogramEstimator.java | 3 +- .../statistics/HistogramEstimatorTest.java | 49 ++++++++++++++++++- 5 files changed, 155 insertions(+), 10 deletions(-) diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/BinaryPredicateStatisticCalculator.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/BinaryPredicateStatisticCalculator.java index e0eb1b198c045..1bb5fbfc549c2 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/BinaryPredicateStatisticCalculator.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/BinaryPredicateStatisticCalculator.java @@ -16,6 +16,7 @@ package com.starrocks.sql.optimizer.statistics; import com.starrocks.analysis.BinaryType; +import com.starrocks.qe.ConnectContext; import com.starrocks.sql.optimizer.operator.scalar.BinaryPredicateOperator; import com.starrocks.sql.optimizer.operator.scalar.ColumnRefOperator; import com.starrocks.sql.optimizer.operator.scalar.ConstantOperator; @@ -295,15 +296,45 @@ public static Statistics estimateColumnToColumnComparison(ScalarOperator leftCol } } + /** + * Estimate selectivity based on domain contains assumption: + * selectivity = 1/max{NDV} + * It's not robust if the NDV is distorted, which usually lead to underestimated selectivity + */ + private static double estimateSelectivityWithNDV(ColumnStatistic leftColumnStatistic, + ColumnStatistic rightColumnStatistic) { + double leftDistinctValuesCount = leftColumnStatistic.getDistinctValuesCount(); + double rightDistinctValuesCount = rightColumnStatistic.getDistinctValuesCount(); + return 1.0 / Math.max(1, Math.max(leftDistinctValuesCount, rightDistinctValuesCount)); + } + + /** + * Estimate selectivity based on histogram: + * selectivity = sum{ overlap_area/total_area of all-buckets } + */ + private static Double estimateSelectivityWithHistogram(ColumnStatistic leftColumnStatistic, + ColumnStatistic rightColumnStatistic) { + ConnectContext context = ConnectContext.get(); + if (context == null || !context.getSessionVariable().isCboEnableHistogramJoinEstimation()) { + return null; + } + return HistogramEstimator.estimateEqualToSelectivity(leftColumnStatistic, rightColumnStatistic); + } + public static Statistics estimateColumnEqualToColumn(ScalarOperator leftColumn, ColumnStatistic leftColumnStatistic, ScalarOperator rightColumn, ColumnStatistic rightColumnStatistic, Statistics statistics, boolean isEqualForNull) { - double leftDistinctValuesCount = leftColumnStatistic.getDistinctValuesCount(); - double rightDistinctValuesCount = rightColumnStatistic.getDistinctValuesCount(); - double selectivity = 1.0 / Math.max(1, Math.max(leftDistinctValuesCount, rightDistinctValuesCount)); + double selectivity; + Double histogramSelectivity = estimateSelectivityWithHistogram(leftColumnStatistic, rightColumnStatistic); + if (histogramSelectivity != null) { + selectivity = histogramSelectivity; + } else { + selectivity = estimateSelectivityWithNDV(leftColumnStatistic, rightColumnStatistic); + } + double rowCount = statistics.getOutputRowCount() * selectivity * (isEqualForNull ? 1 : (1 - leftColumnStatistic.getNullsFraction()) * (1 - rightColumnStatistic.getNullsFraction())); diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java index 7102889ada695..8c3fba7690ece 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java @@ -15,7 +15,11 @@ package com.starrocks.sql.optimizer.statistics; -public class Bucket { +import org.jetbrains.annotations.NotNull; + +import java.util.Objects; + +public class Bucket implements Comparable { private final double lower; private final double upper; private final Long count; @@ -45,6 +49,34 @@ public Long getUpperRepeats() { } public boolean isOverlapped(Bucket other) { - return Math.min(upper, other.upper) - Math.max(lower, other.lower) > 0; + return Math.min(upper, other.upper) - Math.max(lower, other.lower) >= 0; + } + + @Override + public int compareTo(@NotNull Bucket o) { + int lowerComparison = Double.compare(this.lower, o.lower); + if (lowerComparison != 0) { + return lowerComparison; + } + return Double.compare(this.upper, o.upper); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + Bucket bucket = (Bucket) o; + return Double.compare(lower, bucket.lower) == 0 && Double.compare(upper, bucket.upper) == 0 && + Objects.equals(count, bucket.count) && + Objects.equals(upperRepeats, bucket.upperRepeats); + } + + @Override + public int hashCode() { + return Objects.hash(lower, upper, count, upperRepeats); } } \ No newline at end of file diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java index 4c448a884fefe..c2acf06dc554e 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java @@ -60,11 +60,43 @@ public String getMcvString() { sb.append("MCV: ["); mcv.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder())) .limit(printMcvSize) - .forEach(entry -> sb.append("[").append(entry.getKey()).append(":").append(entry.getValue()).append("]")); + .forEach(entry -> sb.append("[").append(entry.getKey()).append(":").append(entry.getValue()) + .append("]")); sb.append("]"); return sb.toString(); } + /** + * Return overlapped buckets with the provided lower&upper bound + */ + public List getOverlappedBuckets(double lower, double upper) { + int left = 0; + int right = buckets.size() - 1; + while (left <= right) { + int mid = (left + right) / 2; + Bucket bucket = buckets.get(mid); + + // A potential bad case is lower & upper can cover most buckets, then this binary-search will fall back + // to an inefficient linear search. But it doesn't hold for the current histogram, because we only + // calculate the overlap within two buckets + if (bucket.getLower() <= upper && bucket.getUpper() >= lower) { + while (mid > 0 && buckets.get(mid - 1).getUpper() >= lower) { + mid--; + } + int endIndex = mid; + while (endIndex < buckets.size() - 1 && buckets.get(endIndex + 1).getLower() <= upper) { + endIndex++; + } + return buckets.subList(mid, endIndex + 1); + } else if (bucket.getUpper() < lower) { + left = mid + 1; + } else { + right = mid - 1; + } + } + return Lists.newArrayList(); + } + public Optional getRowCountInBucket(ConstantOperator constantOperator, double distinctValuesCount) { Optional valueOpt = StatisticUtils.convertStatisticsToDouble(constantOperator.getType(), constantOperator.toString()); @@ -88,9 +120,11 @@ public Optional getRowCountInBucket(ConstantOperator constantOperator, dou } if (constantOperator.getType().isFixedPointType()) { - rowCount = (long) Math.ceil(Math.max(1, rowCount / Math.max(1, (bucket.getUpper() - bucket.getLower())))); + rowCount = (long) Math.ceil( + Math.max(1, rowCount / Math.max(1, (bucket.getUpper() - bucket.getLower())))); } else { - rowCount = (long) Math.ceil(Math.max(1, rowCount / Math.max(1, distinctValuesCount / buckets.size()))); + rowCount = + (long) Math.ceil(Math.max(1, rowCount / Math.max(1, distinctValuesCount / buckets.size()))); } return Optional.of(rowCount); diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java index 7690cdb4a863c..a585c08c9c8a1 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java @@ -45,7 +45,8 @@ public static Double estimateEqualToSelectivity(ColumnStatistic left, ColumnStat double totalArea = 0.0; for (Bucket leftBucket : leftHistogram.getBuckets()) { - for (Bucket rightBucket : rightHistogram.getBuckets()) { + for (Bucket rightBucket : + rightHistogram.getOverlappedBuckets(leftBucket.getLower(), leftBucket.getUpper())) { double overlap = calculateBucketOverlap(leftBucket, rightBucket); overlapArea += overlap; } diff --git a/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java b/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java index a2181901458c6..d2f2897743380 100644 --- a/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java +++ b/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java @@ -14,13 +14,20 @@ package com.starrocks.sql.optimizer.statistics; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; +import java.util.List; +import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.junit.jupiter.api.Assertions.assertEquals; + public class HistogramEstimatorTest { @ParameterizedTest @@ -32,7 +39,7 @@ public void testEstimateEqualToSelectivity( Assertions.assertNull(actualSelectivity); } else { Assertions.assertNotNull(actualSelectivity); - Assertions.assertEquals(expectedSelectivity, actualSelectivity, 0.01); + assertEquals(expectedSelectivity, actualSelectivity, 0.01); } } @@ -130,4 +137,44 @@ private static ColumnStatistic createColumnStatistic(double[] bounds, long[] cou Histogram histogram = builder.build(); return new ColumnStatistic(0, 0, 0, 0, 0, histogram, ColumnStatistic.StatisticType.ESTIMATE); } + + private List verifyBucketIndex(Histogram histogram, List buckets) { + return buckets.stream().map(x -> histogram.getBuckets().indexOf(x)).collect(Collectors.toList()); + } + + @Test + public void testGetOverlappedBuckets() { + Histogram histogram = new Histogram( + Lists.newArrayList( + new Bucket(0, 5, 100L, 0L), + new Bucket(5, 10, 200L, 0L), + new Bucket(10, 15, 300L, 0L), + new Bucket(15, 20, 400L, 0L) + ), + Maps.newHashMap() + ); + + // totally covered range + assertEquals(List.of(0, 1, 2, 3), verifyBucketIndex(histogram, histogram.getOverlappedBuckets(0, 100))); + assertEquals(List.of(0, 1, 2, 3), verifyBucketIndex(histogram, histogram.getOverlappedBuckets(2, 16))); + assertEquals(List.of(0, 1, 2, 3), verifyBucketIndex(histogram, histogram.getOverlappedBuckets(0, 15))); + assertEquals(List.of(0, 1, 2, 3), verifyBucketIndex(histogram, histogram.getOverlappedBuckets(5, 17))); + + // partially covered + assertEquals(List.of(1, 2, 3), verifyBucketIndex(histogram, histogram.getOverlappedBuckets(10, 17))); + assertEquals(List.of(0, 1), verifyBucketIndex(histogram, histogram.getOverlappedBuckets(1, 6))); + assertEquals(List.of(0, 1), verifyBucketIndex(histogram, histogram.getOverlappedBuckets(0, 5))); + assertEquals(List.of(0, 1, 2), verifyBucketIndex(histogram, histogram.getOverlappedBuckets(0, 10))); + assertEquals(List.of(2, 3), verifyBucketIndex(histogram, histogram.getOverlappedBuckets(15, 20))); + + // boundary overlapped + assertEquals(List.of(0), verifyBucketIndex(histogram, histogram.getOverlappedBuckets(-1, 0))); + assertEquals(List.of(3), verifyBucketIndex(histogram, histogram.getOverlappedBuckets(20, 21))); + assertEquals(List.of(0, 1), verifyBucketIndex(histogram, histogram.getOverlappedBuckets(5, 5))); + + // no overlap + assertEquals(List.of(), verifyBucketIndex(histogram, histogram.getOverlappedBuckets(30, 100))); + assertEquals(List.of(), verifyBucketIndex(histogram, histogram.getOverlappedBuckets(-10, -1))); + } + } From 4ae16fbf88c0b63d4951ec81276783297513864c Mon Sep 17 00:00:00 2001 From: Murphy Date: Fri, 18 Oct 2024 16:12:24 +0800 Subject: [PATCH 5/9] use binary search to find overlapped buckets Signed-off-by: Murphy --- .../sql/optimizer/statistics/Histogram.java | 43 ++++++++----------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java index c2acf06dc554e..e4e5f9fb53d05 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java @@ -19,6 +19,7 @@ import com.starrocks.sql.optimizer.operator.scalar.ConstantOperator; import com.starrocks.statistic.StatisticUtils; +import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Map; @@ -66,35 +67,25 @@ public String getMcvString() { return sb.toString(); } - /** - * Return overlapped buckets with the provided lower&upper bound - */ public List getOverlappedBuckets(double lower, double upper) { - int left = 0; - int right = buckets.size() - 1; - while (left <= right) { - int mid = (left + right) / 2; - Bucket bucket = buckets.get(mid); + int startIndex = Collections.binarySearch(buckets, new Bucket(lower, lower, 0L, 0L), + Comparator.comparingDouble(Bucket::getUpper)); + if (startIndex < 0) { + startIndex = -startIndex - 1; + } - // A potential bad case is lower & upper can cover most buckets, then this binary-search will fall back - // to an inefficient linear search. But it doesn't hold for the current histogram, because we only - // calculate the overlap within two buckets - if (bucket.getLower() <= upper && bucket.getUpper() >= lower) { - while (mid > 0 && buckets.get(mid - 1).getUpper() >= lower) { - mid--; - } - int endIndex = mid; - while (endIndex < buckets.size() - 1 && buckets.get(endIndex + 1).getLower() <= upper) { - endIndex++; - } - return buckets.subList(mid, endIndex + 1); - } else if (bucket.getUpper() < lower) { - left = mid + 1; - } else { - right = mid - 1; - } + // Find the first bucket that overlaps with the upper bound + int endIndex = Collections.binarySearch(buckets, new Bucket(upper, upper, 0L, 0L), + Comparator.comparingDouble(Bucket::getLower)); + if (endIndex < 0) { + endIndex = -endIndex - 2; + } + + if (startIndex <= endIndex) { + return buckets.subList(startIndex, endIndex + 1); + } else { + return Lists.newArrayList(); } - return Lists.newArrayList(); } public Optional getRowCountInBucket(ConstantOperator constantOperator, double distinctValuesCount) { From 197f9d739a9e10de381e0d101b635f3ad431a7d6 Mon Sep 17 00:00:00 2001 From: Murphy Date: Fri, 18 Oct 2024 16:18:54 +0800 Subject: [PATCH 6/9] handle corner case of bucket Signed-off-by: Murphy --- .../statistics/HistogramEstimator.java | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java index a585c08c9c8a1..973f846d07e4a 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java @@ -84,13 +84,23 @@ private static double calculateBucketOverlap(Bucket leftBucket, Bucket rightBuck double rightRange = rightUpper - rightLower; double overlapRange = overlapUpper - overlapLower; - double leftOverlapRatio = overlapRange / leftRange; - double rightOverlapRatio = overlapRange / rightRange; + double leftOverlapCount; + if (leftRange <= 0) { + leftOverlapCount = leftBucket.getUpperRepeats(); + } else { + double leftOverlapRatio = overlapRange / leftRange; + leftOverlapCount = leftBucket.getCount() * leftOverlapRatio; + } - // Estimate the count of overlapping elements - double overlapCount = - Math.min(leftBucket.getCount() * leftOverlapRatio, rightBucket.getCount() * rightOverlapRatio); + double rightOverlapCount; + if (rightRange <= 0) { + rightOverlapCount = rightBucket.getUpperRepeats(); + } else { + double rightOverlapRatio = overlapRange / rightRange; + rightOverlapCount = rightBucket.getCount() * rightOverlapRatio; + } - return overlapCount; + // Estimate the count of overlapping elements + return Math.min(leftOverlapCount, rightOverlapCount); } } From 8dd3041e8875d7a0742a66e0ea7e39318d4f7d87 Mon Sep 17 00:00:00 2001 From: Murphy Date: Thu, 24 Oct 2024 14:10:36 +0800 Subject: [PATCH 7/9] remove dead code Signed-off-by: Murphy --- .../sql/optimizer/statistics/Bucket.java | 17 +---------------- .../statistics/StatisticRangeValues.java | 13 ------------- 2 files changed, 1 insertion(+), 29 deletions(-) diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java index 8c3fba7690ece..2f0719fb16432 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java @@ -15,11 +15,9 @@ package com.starrocks.sql.optimizer.statistics; -import org.jetbrains.annotations.NotNull; - import java.util.Objects; -public class Bucket implements Comparable { +public class Bucket { private final double lower; private final double upper; private final Long count; @@ -48,19 +46,6 @@ public Long getUpperRepeats() { return upperRepeats; } - public boolean isOverlapped(Bucket other) { - return Math.min(upper, other.upper) - Math.max(lower, other.lower) >= 0; - } - - @Override - public int compareTo(@NotNull Bucket o) { - int lowerComparison = Double.compare(this.lower, o.lower); - if (lowerComparison != 0) { - return lowerComparison; - } - return Double.compare(this.upper, o.upper); - } - @Override public boolean equals(Object o) { if (this == o) { diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticRangeValues.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticRangeValues.java index 9bf30cffa1cf7..18ca49e6eea97 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticRangeValues.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticRangeValues.java @@ -111,19 +111,6 @@ public double overlapPercentWith(@NotNull StatisticRangeValues other) { return StatisticsEstimateCoefficient.OVERLAP_INFINITE_RANGE_FILTER_COEFFICIENT; } - double overlapLength(StatisticRangeValues other) { - if (this.isEmpty() || other.isEmpty()) { - return 0.0; - } - // If the low and high values is infinite, it represents either string type or unknown of column statistics. - if (this.equals(other) && !isBothInfinite()) { - return 1.0; - } - - double lengthOfIntersect = min(this.high, other.high) - max(this.low, other.low); - return lengthOfIntersect; - } - public StatisticRangeValues intersect(StatisticRangeValues other) { double newLow = max(low, other.low); double newHigh = min(high, other.high); From 1bb6ba86df1cf610283fc2a58e52359dc4c9bcd9 Mon Sep 17 00:00:00 2001 From: Murphy Date: Fri, 25 Oct 2024 10:12:08 +0800 Subject: [PATCH 8/9] piggyback histogram to join children Signed-off-by: Murphy --- .../sql/optimizer/statistics/StatisticsCalculator.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalculator.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalculator.java index 8e87cddafee3d..9109b29c54c99 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalculator.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalculator.java @@ -1109,8 +1109,8 @@ private Void computeJoinNode(ExpressionContext context, JoinOperator joinType, S Statistics rightStatistics = context.getChildStatistics(1); // construct cross join statistics Statistics.Builder crossBuilder = Statistics.builder(); - crossBuilder.addColumnStatisticsFromOtherStatistic(leftStatistics, context.getChildOutputColumns(0), false); - crossBuilder.addColumnStatisticsFromOtherStatistic(rightStatistics, context.getChildOutputColumns(1), false); + crossBuilder.addColumnStatisticsFromOtherStatistic(leftStatistics, context.getChildOutputColumns(0), true); + crossBuilder.addColumnStatisticsFromOtherStatistic(rightStatistics, context.getChildOutputColumns(1), true); double leftRowCount = leftStatistics.getOutputRowCount(); double rightRowCount = rightStatistics.getOutputRowCount(); double crossRowCount = StatisticUtils.multiplyRowCount(leftRowCount, rightRowCount); From 5c2d2fedf4a54fc5f872b497f33e7d1076d7747b Mon Sep 17 00:00:00 2001 From: Murphy Date: Fri, 25 Oct 2024 11:06:55 +0800 Subject: [PATCH 9/9] handle UpperRepeats of each bucket Signed-off-by: Murphy --- .../sql/optimizer/statistics/Bucket.java | 5 ++++ .../sql/optimizer/statistics/Histogram.java | 5 ++++ .../statistics/HistogramEstimator.java | 19 +++++++++---- .../statistics/HistogramEstimatorTest.java | 28 ++++++++++++++++++- 4 files changed, 51 insertions(+), 6 deletions(-) diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java index 2f0719fb16432..0c1169e515567 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Bucket.java @@ -64,4 +64,9 @@ public boolean equals(Object o) { public int hashCode() { return Objects.hash(lower, upper, count, upperRepeats); } + + @Override + public String toString() { + return String.format("[%f,%f,%d,%d]", lower, upper, count, upperRepeats); + } } \ No newline at end of file diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java index e4e5f9fb53d05..af46600235748 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/Histogram.java @@ -151,4 +151,9 @@ public Histogram build() { return new Histogram(buckets, mcv); } } + + @Override + public String toString() { + return "Histogram(buckets=" + buckets + ",mcv=" + mcv + ")"; + } } diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java index 973f846d07e4a..63a3fdc048f2c 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/HistogramEstimator.java @@ -15,6 +15,7 @@ package com.starrocks.sql.optimizer.statistics; import com.google.common.base.Preconditions; +import org.apache.commons.collections.CollectionUtils; /** * Use histogram to estimate cardinality @@ -39,6 +40,10 @@ public static Double estimateEqualToSelectivity(ColumnStatistic left, ColumnStat if (leftHistogram == null || rightHistogram == null) { return null; } + if (CollectionUtils.isEmpty(leftHistogram.getBuckets()) && + CollectionUtils.isEmpty(rightHistogram.getBuckets())) { + return null; + } // Calculate the overlapping area of the two histograms double overlapArea = 0.0; @@ -74,11 +79,6 @@ private static double calculateBucketOverlap(Bucket leftBucket, Bucket rightBuck double overlapLower = Math.max(leftLower, rightLower); double overlapUpper = Math.min(leftUpper, rightUpper); - // If there's no overlap, return 0 - if (overlapLower >= overlapUpper) { - return 0; - } - // Calculate overlap ratio double leftRange = leftUpper - leftLower; double rightRange = rightUpper - rightLower; @@ -90,6 +90,12 @@ private static double calculateBucketOverlap(Bucket leftBucket, Bucket rightBuck } else { double leftOverlapRatio = overlapRange / leftRange; leftOverlapCount = leftBucket.getCount() * leftOverlapRatio; + // left: [lower, upper] + // right: [lower, upper] + // upper repeats should be excluded + if (leftUpper > rightUpper) { + leftOverlapCount -= leftBucket.getUpperRepeats() * leftOverlapRatio; + } } double rightOverlapCount; @@ -98,6 +104,9 @@ private static double calculateBucketOverlap(Bucket leftBucket, Bucket rightBuck } else { double rightOverlapRatio = overlapRange / rightRange; rightOverlapCount = rightBucket.getCount() * rightOverlapRatio; + if (leftUpper < rightUpper) { + rightOverlapCount -= rightBucket.getUpperRepeats() * rightOverlapRatio; + } } // Estimate the count of overlapping elements diff --git a/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java b/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java index d2f2897743380..6a433862cad46 100644 --- a/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java +++ b/fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/HistogramEstimatorTest.java @@ -39,7 +39,8 @@ public void testEstimateEqualToSelectivity( Assertions.assertNull(actualSelectivity); } else { Assertions.assertNotNull(actualSelectivity); - assertEquals(expectedSelectivity, actualSelectivity, 0.01); + assertEquals(expectedSelectivity, actualSelectivity, 0.01, "left histogram: " + left.getHistogram() + + "\nright histogram: " + right.getHistogram()); } } @@ -83,6 +84,20 @@ private static Stream provideTestCases() { createColumnStatistic(createUniformedHistogram(10, 10240, 1 << 16)), 1.0), + // low-cardinality single element histogram + Arguments.of( + createColumnStatistic(createSingleElementHistogram(100, 1024, 1 << 10)), + createColumnStatistic(createSingleElementHistogram(100, 1024, 1 << 10)), + 1.0), + Arguments.of( + createColumnStatistic(createSingleElementHistogram(100, 1024, 1 << 10)), + createColumnStatistic(createUniformedHistogram(100, 1024, 1 << 16)), + 0.0), + Arguments.of( + createColumnStatistic(createUniformedHistogram(100, 1024, 1 << 16)), + createColumnStatistic(createSingleElementHistogram(100, 1024, 1 << 10)), + 0.0), + // Completely overlapping histograms Arguments.of( createColumnStatistic(new double[] {1, 5, 10}, new long[] {100, 200}), @@ -120,6 +135,17 @@ private static Histogram createUniformedHistogram(int numBuckets, double bucketR return builder.build(); } + // upper == lower + private static Histogram createSingleElementHistogram(int numBuckets, double bucketRange, long perBucketCount) { + Histogram.Builder builder = new Histogram.Builder(); + double lower = 0.0; + for (int i = 0; i < numBuckets; i++) { + builder.addBucket(new Bucket(lower, lower, perBucketCount, perBucketCount)); + lower += bucketRange; + } + return builder.build(); + } + // create an empty column statistics private static ColumnStatistic createColumnStatistic() { return new ColumnStatistic(0, 0, 0, 0, 0, null, ColumnStatistic.StatisticType.ESTIMATE);