Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancement] support histogram-based join selectivity estimation #52098

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ public class SessionVariable implements Serializable, Writable, Cloneable {
public static final String CBO_MAX_REORDER_NODE = "cbo_max_reorder_node";
public static final String CBO_PRUNE_SHUFFLE_COLUMN_RATE = "cbo_prune_shuffle_column_rate";
public static final String CBO_PUSH_DOWN_AGGREGATE_MODE = "cbo_push_down_aggregate_mode";
public static final String CBO_ENABLE_HISTOGRAM_JOIN_ESTIMATION = "cbo_enable_histogram_join_estimation";

public static final String CBO_PUSH_DOWN_DISTINCT_BELOW_WINDOW = "cbo_push_down_distinct_below_window";
public static final String CBO_PUSH_DOWN_AGGREGATE = "cbo_push_down_aggregate";
Expand Down Expand Up @@ -1490,6 +1491,9 @@ public static MaterializedViewRewriteMode parse(String str) {
@VarAttr(name = CBO_PUSH_DOWN_GROUPINGSET_RESHUFFLE, flag = VariableMgr.INVISIBLE)
private boolean cboPushDownGroupingSetReshuffle = true;

@VarAttr(name = CBO_ENABLE_HISTOGRAM_JOIN_ESTIMATION, flag = VariableMgr.INVISIBLE)
private boolean cboEnableHistogramJoinEstimation = true;

@VariableMgr.VarAttr(name = PARSE_TOKENS_LIMIT)
private int parseTokensLimit = 3500000;

Expand Down Expand Up @@ -3479,6 +3483,10 @@ public void setCboPushDownDistinctBelowWindow(boolean flag) {
this.cboPushDownDistinctBelowWindow = flag;
}

public boolean isCboEnableHistogramJoinEstimation() {
return cboEnableHistogramJoinEstimation;
}

public boolean isCboPushDownDistinctBelowWindow() {
return this.cboPushDownDistinctBelowWindow;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
package com.starrocks.sql.optimizer.statistics;

import com.starrocks.analysis.BinaryType;
import com.starrocks.qe.ConnectContext;
import com.starrocks.sql.optimizer.operator.scalar.BinaryPredicateOperator;
import com.starrocks.sql.optimizer.operator.scalar.ColumnRefOperator;
import com.starrocks.sql.optimizer.operator.scalar.ConstantOperator;
Expand Down Expand Up @@ -295,15 +296,45 @@ public static Statistics estimateColumnToColumnComparison(ScalarOperator leftCol
}
}

/**
* Estimate selectivity based on domain contains assumption:
* selectivity = 1/max{NDV}
* It's not robust if the NDV is distorted, which usually lead to underestimated selectivity
*/
private static double estimateSelectivityWithNDV(ColumnStatistic leftColumnStatistic,
ColumnStatistic rightColumnStatistic) {
double leftDistinctValuesCount = leftColumnStatistic.getDistinctValuesCount();
double rightDistinctValuesCount = rightColumnStatistic.getDistinctValuesCount();
return 1.0 / Math.max(1, Math.max(leftDistinctValuesCount, rightDistinctValuesCount));
}

/**
* Estimate selectivity based on histogram:
* selectivity = sum{ overlap_area/total_area of all-buckets }
*/
private static Double estimateSelectivityWithHistogram(ColumnStatistic leftColumnStatistic,
ColumnStatistic rightColumnStatistic) {
ConnectContext context = ConnectContext.get();
if (context == null || !context.getSessionVariable().isCboEnableHistogramJoinEstimation()) {
return null;
}
return HistogramEstimator.estimateEqualToSelectivity(leftColumnStatistic, rightColumnStatistic);
}

public static Statistics estimateColumnEqualToColumn(ScalarOperator leftColumn,
ColumnStatistic leftColumnStatistic,
ScalarOperator rightColumn,
ColumnStatistic rightColumnStatistic,
Statistics statistics,
boolean isEqualForNull) {
double leftDistinctValuesCount = leftColumnStatistic.getDistinctValuesCount();
double rightDistinctValuesCount = rightColumnStatistic.getDistinctValuesCount();
double selectivity = 1.0 / Math.max(1, Math.max(leftDistinctValuesCount, rightDistinctValuesCount));
double selectivity;
Double histogramSelectivity = estimateSelectivityWithHistogram(leftColumnStatistic, rightColumnStatistic);
if (histogramSelectivity != null) {
selectivity = histogramSelectivity;
} else {
selectivity = estimateSelectivityWithNDV(leftColumnStatistic, rightColumnStatistic);
}

double rowCount = statistics.getOutputRowCount() * selectivity *
(isEqualForNull ? 1 :
(1 - leftColumnStatistic.getNullsFraction()) * (1 - rightColumnStatistic.getNullsFraction()));
murphyatwork marked this conversation as resolved.
Show resolved Hide resolved
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

package com.starrocks.sql.optimizer.statistics;

import java.util.Objects;

public class Bucket {
private final double lower;
private final double upper;
Expand Down Expand Up @@ -43,4 +45,28 @@ public Long getCount() {
public Long getUpperRepeats() {
return upperRepeats;
}

@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
Bucket bucket = (Bucket) o;
return Double.compare(lower, bucket.lower) == 0 && Double.compare(upper, bucket.upper) == 0 &&
Objects.equals(count, bucket.count) &&
Objects.equals(upperRepeats, bucket.upperRepeats);
}

@Override
public int hashCode() {
return Objects.hash(lower, upper, count, upperRepeats);
}

@Override
public String toString() {
return String.format("[%f,%f,%d,%d]", lower, upper, count, upperRepeats);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@

package com.starrocks.sql.optimizer.statistics;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.starrocks.sql.optimizer.operator.scalar.ConstantOperator;
import com.starrocks.statistic.StatisticUtils;

import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -58,11 +61,33 @@ public String getMcvString() {
sb.append("MCV: [");
mcv.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))
.limit(printMcvSize)
.forEach(entry -> sb.append("[").append(entry.getKey()).append(":").append(entry.getValue()).append("]"));
.forEach(entry -> sb.append("[").append(entry.getKey()).append(":").append(entry.getValue())
.append("]"));
sb.append("]");
return sb.toString();
}

public List<Bucket> getOverlappedBuckets(double lower, double upper) {
int startIndex = Collections.binarySearch(buckets, new Bucket(lower, lower, 0L, 0L),
Comparator.comparingDouble(Bucket::getUpper));
if (startIndex < 0) {
startIndex = -startIndex - 1;
}

// Find the first bucket that overlaps with the upper bound
int endIndex = Collections.binarySearch(buckets, new Bucket(upper, upper, 0L, 0L),
Comparator.comparingDouble(Bucket::getLower));
if (endIndex < 0) {
endIndex = -endIndex - 2;
}

if (startIndex <= endIndex) {
return buckets.subList(startIndex, endIndex + 1);
} else {
return Lists.newArrayList();
}
}

public Optional<Long> getRowCountInBucket(ConstantOperator constantOperator, double distinctValuesCount) {
Optional<Double> valueOpt = StatisticUtils.convertStatisticsToDouble(constantOperator.getType(),
constantOperator.toString());
Expand All @@ -86,9 +111,11 @@ public Optional<Long> getRowCountInBucket(ConstantOperator constantOperator, dou
}

if (constantOperator.getType().isFixedPointType()) {
rowCount = (long) Math.ceil(Math.max(1, rowCount / Math.max(1, (bucket.getUpper() - bucket.getLower()))));
rowCount = (long) Math.ceil(
Math.max(1, rowCount / Math.max(1, (bucket.getUpper() - bucket.getLower()))));
} else {
rowCount = (long) Math.ceil(Math.max(1, rowCount / Math.max(1, distinctValuesCount / buckets.size())));
rowCount =
(long) Math.ceil(Math.max(1, rowCount / Math.max(1, distinctValuesCount / buckets.size())));
}

return Optional.of(rowCount);
Expand All @@ -105,4 +132,28 @@ public Optional<Long> getRowCountInBucket(ConstantOperator constantOperator, dou

return Optional.empty();
}

static class Builder {
private final List<Bucket> buckets = Lists.newArrayList();
private final Map<String, Long> mcv = Maps.newHashMap();

public Builder addBucket(Bucket bucket) {
this.buckets.add(bucket);
return this;
}

public Builder addCommonValue(String key, Long count) {
this.mcv.put(key, count);
return this;
}

public Histogram build() {
return new Histogram(buckets, mcv);
}
}

@Override
public String toString() {
return "Histogram(buckets=" + buckets + ",mcv=" + mcv + ")";
}
}
murphyatwork marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.starrocks.sql.optimizer.statistics;

import com.google.common.base.Preconditions;
import org.apache.commons.collections.CollectionUtils;

/**
* Use histogram to estimate cardinality
*/
public class HistogramEstimator {

/**
* Estimate the selectivity of two columns with EqualTo operator
* Return null if fail to do the estimation
*/
public static Double estimateEqualToSelectivity(ColumnStatistic left, ColumnStatistic right) {
// Check if input parameters are valid
if (left == null || right == null) {
return null;
}

// Get histograms
Histogram leftHistogram = left.getHistogram();
Histogram rightHistogram = right.getHistogram();

// If either histogram is empty, estimation is not possible
if (leftHistogram == null || rightHistogram == null) {
return null;
}
if (CollectionUtils.isEmpty(leftHistogram.getBuckets()) &&
CollectionUtils.isEmpty(rightHistogram.getBuckets())) {
return null;
}

// Calculate the overlapping area of the two histograms
double overlapArea = 0.0;
double totalArea = 0.0;

for (Bucket leftBucket : leftHistogram.getBuckets()) {
for (Bucket rightBucket :
rightHistogram.getOverlappedBuckets(leftBucket.getLower(), leftBucket.getUpper())) {
double overlap = calculateBucketOverlap(leftBucket, rightBucket);
overlapArea += overlap;
}
totalArea += leftBucket.getCount();
}

// Calculate selectivity
if (totalArea > 0) {
double selectivity = overlapArea / totalArea;
Preconditions.checkState(0.0 <= selectivity && selectivity <= 1.0,
"exceptional selectivity: " + selectivity);
return overlapArea / totalArea;
} else {
return null;
}
}

private static double calculateBucketOverlap(Bucket leftBucket, Bucket rightBucket) {
double leftLower = leftBucket.getLower();
double leftUpper = leftBucket.getUpper();
double rightLower = rightBucket.getLower();
double rightUpper = rightBucket.getUpper();

// Calculate overlap interval
double overlapLower = Math.max(leftLower, rightLower);
double overlapUpper = Math.min(leftUpper, rightUpper);

// Calculate overlap ratio
double leftRange = leftUpper - leftLower;
double rightRange = rightUpper - rightLower;
double overlapRange = overlapUpper - overlapLower;

double leftOverlapCount;
if (leftRange <= 0) {
leftOverlapCount = leftBucket.getUpperRepeats();
} else {
double leftOverlapRatio = overlapRange / leftRange;
leftOverlapCount = leftBucket.getCount() * leftOverlapRatio;
// left: [lower, upper]
// right: [lower, upper]
// upper repeats should be excluded
if (leftUpper > rightUpper) {
leftOverlapCount -= leftBucket.getUpperRepeats() * leftOverlapRatio;
}
}

double rightOverlapCount;
if (rightRange <= 0) {
rightOverlapCount = rightBucket.getUpperRepeats();
} else {
double rightOverlapRatio = overlapRange / rightRange;
rightOverlapCount = rightBucket.getCount() * rightOverlapRatio;
if (leftUpper < rightUpper) {
rightOverlapCount -= rightBucket.getUpperRepeats() * rightOverlapRatio;
}
}

// Estimate the count of overlapping elements
return Math.min(leftOverlapCount, rightOverlapCount);
}
}
murphyatwork marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -1109,8 +1109,8 @@ private Void computeJoinNode(ExpressionContext context, JoinOperator joinType, S
Statistics rightStatistics = context.getChildStatistics(1);
// construct cross join statistics
Statistics.Builder crossBuilder = Statistics.builder();
crossBuilder.addColumnStatisticsFromOtherStatistic(leftStatistics, context.getChildOutputColumns(0), false);
crossBuilder.addColumnStatisticsFromOtherStatistic(rightStatistics, context.getChildOutputColumns(1), false);
crossBuilder.addColumnStatisticsFromOtherStatistic(leftStatistics, context.getChildOutputColumns(0), true);
crossBuilder.addColumnStatisticsFromOtherStatistic(rightStatistics, context.getChildOutputColumns(1), true);
double leftRowCount = leftStatistics.getOutputRowCount();
double rightRowCount = rightStatistics.getOutputRowCount();
double crossRowCount = StatisticUtils.multiplyRowCount(leftRowCount, rightRowCount);
Expand Down
Loading
Loading