diff --git a/src/include/duckdb/optimizer/join_order/cardinality_estimator.hpp b/src/include/duckdb/optimizer/join_order/cardinality_estimator.hpp index 8aec1cd02c46..2374121a94f3 100644 --- a/src/include/duckdb/optimizer/join_order/cardinality_estimator.hpp +++ b/src/include/duckdb/optimizer/join_order/cardinality_estimator.hpp @@ -18,11 +18,11 @@ class FilterInfo; struct DenomInfo { DenomInfo(JoinRelationSet &numerator_relations, double filter_strength, double denominator) - : numerator_relations(numerator_relations), filter_strength(filter_strength), denominator(denominator) { + : numerator_relations(numerator_relations), extra_multiplier(filter_strength), denominator(denominator) { } JoinRelationSet &numerator_relations; - double filter_strength; + double extra_multiplier; double denominator; }; @@ -61,9 +61,11 @@ class FilterInfoWithTotalDomains { struct Subgraph2Denominator { optional_ptr relations; optional_ptr numerator_relations; + double numerator_relations_extra; double denom; - Subgraph2Denominator() : relations(nullptr), numerator_relations(nullptr), denom(1) {}; + Subgraph2Denominator() + : relations(nullptr), numerator_relations(nullptr), numerator_relations_extra(1), denom(1) {}; }; class CardinalityHelper { @@ -89,6 +91,7 @@ class CardinalityEstimator { public: static constexpr double DEFAULT_SEMI_ANTI_SELECTIVITY = 5; static constexpr double DEFAULT_LT_GT_MULTIPLIER = 2.5; + static constexpr double LEFT_JOIN_COEFFICIENT = 0.008; explicit CardinalityEstimator() {}; private: diff --git a/src/include/duckdb/optimizer/join_order/cost_model.hpp b/src/include/duckdb/optimizer/join_order/cost_model.hpp index d6a9245b8ed3..bddbe3bd0f9f 100644 --- a/src/include/duckdb/optimizer/join_order/cost_model.hpp +++ b/src/include/duckdb/optimizer/join_order/cost_model.hpp @@ -27,7 +27,8 @@ class CostModel { void InitCostModel(); //! Compute cost of a join relation set - double ComputeCost(DPJoinNode &left, DPJoinNode &right); + double ComputeCost(DPJoinNode &left, DPJoinNode &right, NeighborInfo &connection); + double ComputeJoinCost(DPJoinNode &left, DPJoinNode &right); //! Cardinality Estimator used to calculate cost CardinalityEstimator cardinality_estimator; diff --git a/src/include/duckdb/optimizer/join_order/join_order_optimizer.hpp b/src/include/duckdb/optimizer/join_order/join_order_optimizer.hpp index 921e03a21423..4c99a2e5e014 100644 --- a/src/include/duckdb/optimizer/join_order/join_order_optimizer.hpp +++ b/src/include/duckdb/optimizer/join_order/join_order_optimizer.hpp @@ -11,15 +11,10 @@ #include "duckdb/common/unordered_map.hpp" #include "duckdb/common/unordered_set.hpp" #include "duckdb/optimizer/join_order/cardinality_estimator.hpp" -#include "duckdb/optimizer/join_order/join_node.hpp" -#include "duckdb/optimizer/join_order/join_relation.hpp" -#include "duckdb/optimizer/join_order/query_graph.hpp" +#include "duckdb/optimizer/column_binding_replacer.hpp" #include "duckdb/optimizer/join_order/query_graph_manager.hpp" #include "duckdb/parser/expression_map.hpp" #include "duckdb/planner/logical_operator.hpp" -#include "duckdb/planner/logical_operator_visitor.hpp" - -#include namespace duckdb { @@ -30,7 +25,8 @@ class JoinOrderOptimizer { public: //! Perform join reordering inside a plan - unique_ptr Optimize(unique_ptr plan, optional_ptr stats = nullptr); + unique_ptr Optimize(unique_ptr plan, optional_ptr stats = nullptr, + bool remove_projections = false); //! Adds/gets materialized CTE stats void AddMaterializedCTEStats(idx_t index, RelationStats &&stats); RelationStats GetMaterializedCTEStats(idx_t index); @@ -62,4 +58,15 @@ class JoinOrderOptimizer { optional_ptr delim_scan_stats; }; +class RemoveUnnecessaryProjections { +public: + explicit RemoveUnnecessaryProjections(); + unique_ptr RemoveProjections(unique_ptr plan); + unique_ptr RemoveProjectionsChildren(unique_ptr plan); + ColumnBindingReplacer replacer; + +private: + bool first_projection; +}; + } // namespace duckdb diff --git a/src/include/duckdb/optimizer/join_order/join_relation.hpp b/src/include/duckdb/optimizer/join_order/join_relation.hpp index 7b040c1b5ef4..0d48d00d770b 100644 --- a/src/include/duckdb/optimizer/join_order/join_relation.hpp +++ b/src/include/duckdb/optimizer/join_order/join_relation.hpp @@ -9,8 +9,8 @@ #pragma once #include "duckdb/common/common.hpp" -#include "duckdb/common/unordered_map.hpp" #include "duckdb/common/unordered_set.hpp" +#include "duckdb/common/optional_ptr.hpp" namespace duckdb { @@ -20,6 +20,9 @@ struct JoinRelationSet { } string ToString() const; + bool Empty() { + return count == 0; + } unsafe_unique_array relations; idx_t count; @@ -39,6 +42,7 @@ class JoinRelationSetManager { }; public: + JoinRelationSet &GetEmptyJoinRelationSet(); //! Create or get a JoinRelationSet from a single node with the given index JoinRelationSet &GetJoinRelation(idx_t index); //! Create or get a JoinRelationSet from a set of relation bindings @@ -54,6 +58,7 @@ class JoinRelationSetManager { private: JoinRelationTreeNode root; + optional_ptr empty_relation_set; }; } // namespace duckdb diff --git a/src/include/duckdb/optimizer/join_order/query_graph_manager.hpp b/src/include/duckdb/optimizer/join_order/query_graph_manager.hpp index e98868115af3..a2b15c1576b0 100644 --- a/src/include/duckdb/optimizer/join_order/query_graph_manager.hpp +++ b/src/include/duckdb/optimizer/join_order/query_graph_manager.hpp @@ -9,21 +9,15 @@ #pragma once #include "duckdb/common/common.hpp" -#include "duckdb/common/enums/join_type.hpp" #include "duckdb/common/optional_ptr.hpp" -#include "duckdb/common/pair.hpp" #include "duckdb/common/unordered_map.hpp" -#include "duckdb/common/unordered_set.hpp" #include "duckdb/common/vector.hpp" #include "duckdb/optimizer/join_order/join_node.hpp" #include "duckdb/optimizer/join_order/join_relation.hpp" #include "duckdb/optimizer/join_order/query_graph.hpp" #include "duckdb/optimizer/join_order/relation_manager.hpp" -#include "duckdb/planner/column_binding.hpp" #include "duckdb/planner/logical_operator.hpp" -#include - namespace duckdb { class QueryGraphEdges; @@ -37,29 +31,6 @@ struct GenerateJoinRelation { unique_ptr op; }; -//! Filter info struct that is used by the cardinality estimator to set the initial cardinality -//! but is also eventually transformed into a query edge. -class FilterInfo { -public: - FilterInfo(unique_ptr filter, JoinRelationSet &set, idx_t filter_index, - JoinType join_type = JoinType::INNER) - : filter(std::move(filter)), set(set), filter_index(filter_index), join_type(join_type) { - } - -public: - unique_ptr filter; - reference set; - idx_t filter_index; - JoinType join_type; - optional_ptr left_set; - optional_ptr right_set; - ColumnBinding left_binding; - ColumnBinding right_binding; - - void SetLeftSet(optional_ptr left_set_new); - void SetRightSet(optional_ptr right_set_new); -}; - //! The QueryGraphManager manages the process of extracting the reorderable and nonreorderable operations //! from the logical plan and creating the intermediate structures needed by the plan enumerator. //! When the plan enumerator finishes, the Query Graph Manger can then recreate the logical plan. @@ -94,7 +65,7 @@ class QueryGraphManager { //! products to create edges. void CreateQueryGraphCrossProduct(JoinRelationSet &left, JoinRelationSet &right); - //! A map to store the optimal join plan found for a specific JoinRelationSet* + //! A map to store the optimal join plan found for a specific JoinRelationSet optional_ptr>> plans; private: @@ -106,8 +77,6 @@ class QueryGraphManager { QueryGraphEdges query_graph; - void GetColumnBinding(Expression &expression, ColumnBinding &binding); - void CreateHyperGraphEdges(); GenerateJoinRelation GenerateJoins(vector> &extracted_relations, JoinRelationSet &set); diff --git a/src/include/duckdb/optimizer/join_order/relation_manager.hpp b/src/include/duckdb/optimizer/join_order/relation_manager.hpp index 3b8fda1c67f6..7227bb8cb892 100644 --- a/src/include/duckdb/optimizer/join_order/relation_manager.hpp +++ b/src/include/duckdb/optimizer/join_order/relation_manager.hpp @@ -11,17 +11,18 @@ #include "duckdb/common/unordered_map.hpp" #include "duckdb/common/unordered_set.hpp" #include "duckdb/optimizer/join_order/cardinality_estimator.hpp" -#include "duckdb/optimizer/join_order/join_node.hpp" #include "duckdb/optimizer/join_order/join_relation.hpp" #include "duckdb/optimizer/join_order/relation_statistics_helper.hpp" #include "duckdb/parser/expression_map.hpp" #include "duckdb/planner/logical_operator.hpp" #include "duckdb/planner/logical_operator_visitor.hpp" +#include "duckdb/common/enums/join_type.hpp" +#include "duckdb/parser/expression/conjunction_expression.hpp" + namespace duckdb { class JoinOrderOptimizer; -class FilterInfo; //! Represents a single relation and any metadata accompanying that relation struct SingleJoinRelation { @@ -36,6 +37,40 @@ struct SingleJoinRelation { } }; +//! FilterInfo models strores filter information so that edges between relations can be made +//! with the original ColumnBinding information available so that the cardinality estimator can +//! view the statistics of the underlying base tables. +class FilterInfo { +public: + FilterInfo(unique_ptr filter, optional_ptr set, idx_t filter_index, JoinType join_type, + optional_ptr left_relation_set, optional_ptr right_relation_set, + ColumnBinding left_binding, ColumnBinding right_binding) + : filter(std::move(filter)), set(set), filter_index(filter_index), join_type(join_type), + left_relation_set(left_relation_set), right_relation_set(right_relation_set), left_binding(left_binding), + right_binding(right_binding) { + } + FilterInfo(unique_ptr filter, optional_ptr set, idx_t filter_index, JoinType join_type, + optional_ptr left_relation_set, optional_ptr right_relation_set) + : filter(std::move(filter)), set(set), filter_index(filter_index), join_type(join_type), + left_relation_set(left_relation_set), right_relation_set(right_relation_set) { + } + +public: + unique_ptr filter; + optional_ptr set; + idx_t filter_index; + JoinType join_type; + optional_ptr left_relation_set; + optional_ptr right_relation_set; + // TODO: change this to be a binding set + ColumnBinding left_binding; + ColumnBinding right_binding; + + void SetLeftSet(optional_ptr left_set_new); + void SetRightSet(optional_ptr right_set_new); + bool SingleColumnFilter(); +}; + class RelationManager { public: explicit RelationManager(ClientContext &context) : context(context) { @@ -49,12 +84,26 @@ class RelationManager { //! for each join filter in the logical plan op, extract the relations that are referred to on //! both sides of the join filter, along with the tables & indexes. - vector> ExtractEdges(LogicalOperator &op, - vector> &filter_operators, + vector> ExtractEdges(vector> &filter_operators, JoinRelationSetManager &set_manager); - //! Extract the set of relations referred to inside an expression - bool ExtractBindings(Expression &expression, unordered_set &bindings); + //! Extract all column bindings from an expression with the + void ExtractColumnBindingsFromExpression(Expression &expression, unordered_set &bindings); + //! Extract the Column binding from an expression + void ExtractColumnBinding(Expression &expression, ColumnBinding &binding); + // Inspects an expression and creates filter info instances that can connect two relations + // If the expreession (or conjunction expression children cannot create a FilterInfo), then + // they are returned to be added to the filter_op so they are pushed down at the end of reconstruction. + vector> CreateFilterInfoFromExpression(unique_ptr expr, + JoinRelationSetManager &set_manager, + JoinType join_type = JoinType::INNER); + vector> + CreateFilterFromConjunctionChildren(unique_ptr conjunction_expression, + JoinRelationSetManager &set_manager, JoinType join_type); + + optional_ptr GetJoinRelations(column_binding_set_t &column_bindings, + JoinRelationSetManager &set_manager); + void GetColumnBindingsFromExpression(Expression &expression, column_binding_set_t &column_bindings); void AddRelation(LogicalOperator &op, optional_ptr parent, const RelationStats &stats); void AddAggregateOrWindowRelation(LogicalOperator &op, optional_ptr parent, @@ -74,6 +123,10 @@ class RelationManager { //! Set of all relations considered in the join optimizer vector> relations; unordered_set no_cross_product_relations; + + //! used when extracting edges from the relations. They are then passed to the + //! query graph manager. + vector> filter_infos_; }; } // namespace duckdb diff --git a/src/optimizer/join_order/cardinality_estimator.cpp b/src/optimizer/join_order/cardinality_estimator.cpp index 4b5e22adad1e..99b30fed431b 100644 --- a/src/optimizer/join_order/cardinality_estimator.cpp +++ b/src/optimizer/join_order/cardinality_estimator.cpp @@ -14,14 +14,14 @@ namespace duckdb { // The filter was made on top of a logical sample or other projection, // but no specific columns are referenced. See issue 4978 number 4. bool CardinalityEstimator::EmptyFilter(FilterInfo &filter_info) { - if (!filter_info.left_set && !filter_info.right_set) { + if (filter_info.left_relation_set->count == 0 && filter_info.right_relation_set->count == 0) { return true; } return false; } void CardinalityEstimator::AddRelationTdom(FilterInfo &filter_info) { - D_ASSERT(filter_info.set.get().count >= 1); + D_ASSERT(filter_info.set->count >= 1); for (const RelationsToTDom &r2tdom : relations_to_tdoms) { auto &i_set = r2tdom.equivalent_relations; if (i_set.find(filter_info.left_binding) != i_set.end()) { @@ -37,7 +37,8 @@ void CardinalityEstimator::AddRelationTdom(FilterInfo &filter_info) { } bool CardinalityEstimator::SingleColumnFilter(duckdb::FilterInfo &filter_info) { - if (filter_info.left_set && filter_info.right_set && filter_info.set.get().count > 1) { + if (filter_info.left_relation_set->count >= 1 && filter_info.right_relation_set->count >= 1 && + filter_info.set->count >= 2) { // Both set and are from different relations return false; } @@ -103,16 +104,11 @@ void CardinalityEstimator::InitEquivalentRelations(const vectorSingleColumnFilter()) { continue; } - D_ASSERT(filter->left_set->count >= 1); - D_ASSERT(filter->right_set->count >= 1); + D_ASSERT(filter->left_relation_set->count >= 1); + D_ASSERT(filter->right_relation_set->count >= 1); auto matching_equivalent_sets = DetermineMatchingEquivalentSets(filter.get()); AddToEquivalenceSets(filter.get(), matching_equivalent_sets); @@ -137,14 +133,14 @@ double CardinalityEstimator::GetNumerator(JoinRelationSet &set) { } bool EdgeConnects(FilterInfoWithTotalDomains &edge, Subgraph2Denominator &subgraph) { - if (edge.filter_info->left_set) { - if (JoinRelationSet::IsSubset(*subgraph.relations, *edge.filter_info->left_set)) { + if (edge.filter_info->left_relation_set) { + if (JoinRelationSet::IsSubset(*subgraph.relations, *edge.filter_info->left_relation_set)) { // cool return true; } } - if (edge.filter_info->right_set) { - if (JoinRelationSet::IsSubset(*subgraph.relations, *edge.filter_info->right_set)) { + if (edge.filter_info->right_relation_set) { + if (JoinRelationSet::IsSubset(*subgraph.relations, *edge.filter_info->right_relation_set)) { return true; } } @@ -156,7 +152,8 @@ vector GetEdges(vector &relations_t vector res; for (auto &relation_2_tdom : relations_to_tdom) { for (auto &filter : relation_2_tdom.filters) { - if (JoinRelationSet::IsSubset(requested_set, filter->set)) { + if (JoinRelationSet::IsSubset(requested_set, *filter->set) && + filter->left_relation_set != filter->right_relation_set) { FilterInfoWithTotalDomains new_edge(filter, relation_2_tdom); res.push_back(new_edge); } @@ -199,8 +196,8 @@ JoinRelationSet &CardinalityEstimator::UpdateNumeratorRelations(Subgraph2Denomin switch (filter.filter_info->join_type) { case JoinType::SEMI: case JoinType::ANTI: { - if (JoinRelationSet::IsSubset(*left.relations, *filter.filter_info->left_set) && - JoinRelationSet::IsSubset(*right.relations, *filter.filter_info->right_set)) { + if (JoinRelationSet::IsSubset(*left.relations, *filter.filter_info->left_relation_set) && + JoinRelationSet::IsSubset(*right.relations, *filter.filter_info->right_relation_set)) { return *left.numerator_relations; } return *right.numerator_relations; @@ -215,6 +212,7 @@ double CardinalityEstimator::CalculateUpdatedDenom(Subgraph2Denominator left, Su FilterInfoWithTotalDomains &filter) { double new_denom = left.denom * right.denom; switch (filter.filter_info->join_type) { + case JoinType::LEFT: case JoinType::INNER: { bool set = false; ExpressionType comparison_type = ExpressionType::COMPARE_EQUAL; @@ -262,8 +260,8 @@ double CardinalityEstimator::CalculateUpdatedDenom(Subgraph2Denominator left, Su } case JoinType::SEMI: case JoinType::ANTI: { - if (JoinRelationSet::IsSubset(*left.relations, *filter.filter_info->left_set) && - JoinRelationSet::IsSubset(*right.relations, *filter.filter_info->right_set)) { + if (JoinRelationSet::IsSubset(*left.relations, *filter.filter_info->left_relation_set) && + JoinRelationSet::IsSubset(*right.relations, *filter.filter_info->right_relation_set)) { new_denom = left.denom * CardinalityEstimator::DEFAULT_SEMI_ANTI_SELECTIVITY; return new_denom; } @@ -309,31 +307,43 @@ DenomInfo CardinalityEstimator::GetDenominator(JoinRelationSet &set) { // this helps cover a case where there are no subgraphs yet, and the only join filter is a SEMI JOIN auto left_subgraph = Subgraph2Denominator(); auto right_subgraph = Subgraph2Denominator(); - left_subgraph.relations = edge.filter_info->left_set; - left_subgraph.numerator_relations = edge.filter_info->left_set; - right_subgraph.relations = edge.filter_info->right_set; - right_subgraph.numerator_relations = edge.filter_info->right_set; + left_subgraph.relations = edge.filter_info->left_relation_set; + left_subgraph.numerator_relations = edge.filter_info->left_relation_set; + right_subgraph.relations = edge.filter_info->right_relation_set; + right_subgraph.numerator_relations = edge.filter_info->right_relation_set; left_subgraph.numerator_relations = &UpdateNumeratorRelations(left_subgraph, right_subgraph, edge); - left_subgraph.relations = edge.filter_info->set.get(); + if (edge.filter_info->join_type == JoinType::LEFT) { + auto denom = + edge.has_tdom_hll ? static_cast(edge.tdom_hll) : static_cast(edge.tdom_no_hll); + denom = MaxValue(denom, 1); + left_subgraph.numerator_relations_extra = 1 + LEFT_JOIN_COEFFICIENT * (denom - 1); + } + left_subgraph.relations = edge.filter_info->set; left_subgraph.denom = CalculateUpdatedDenom(left_subgraph, right_subgraph, edge); subgraphs.push_back(left_subgraph); } else if (subgraph_connections.size() == 1) { auto left_subgraph = &subgraphs.at(subgraph_connections.at(0)); auto right_subgraph = Subgraph2Denominator(); - right_subgraph.relations = edge.filter_info->right_set; - right_subgraph.numerator_relations = edge.filter_info->right_set; + right_subgraph.relations = edge.filter_info->right_relation_set; + right_subgraph.numerator_relations = edge.filter_info->right_relation_set; if (JoinRelationSet::IsSubset(*left_subgraph->relations, *right_subgraph.relations)) { - right_subgraph.relations = edge.filter_info->left_set; - right_subgraph.numerator_relations = edge.filter_info->left_set; + right_subgraph.relations = edge.filter_info->left_relation_set; + right_subgraph.numerator_relations = edge.filter_info->left_relation_set; } - if (JoinRelationSet::IsSubset(*left_subgraph->relations, *edge.filter_info->left_set) && - JoinRelationSet::IsSubset(*left_subgraph->relations, *edge.filter_info->right_set)) { + if (JoinRelationSet::IsSubset(*left_subgraph->relations, *edge.filter_info->left_relation_set) && + JoinRelationSet::IsSubset(*left_subgraph->relations, *edge.filter_info->right_relation_set)) { // here we have an edge that connects the same subgraph to the same subgraph. Just continue. no need to // update the denom continue; } left_subgraph->numerator_relations = &UpdateNumeratorRelations(*left_subgraph, right_subgraph, edge); + if (edge.filter_info->join_type == JoinType::LEFT) { + auto denom = + edge.has_tdom_hll ? static_cast(edge.tdom_hll) : static_cast(edge.tdom_no_hll); + denom = MaxValue(denom, 1); + left_subgraph->numerator_relations_extra = 1 + LEFT_JOIN_COEFFICIENT * (denom - 1); + } left_subgraph->relations = &set_manager.Union(*left_subgraph->relations, *right_subgraph.relations); left_subgraph->denom = CalculateUpdatedDenom(*left_subgraph, right_subgraph, edge); } else if (subgraph_connections.size() == 2) { @@ -346,6 +356,12 @@ DenomInfo CardinalityEstimator::GetDenominator(JoinRelationSet &set) { subgraph_to_merge_into->numerator_relations = &UpdateNumeratorRelations(*subgraph_to_merge_into, *subgraph_to_delete, edge); subgraph_to_merge_into->denom = CalculateUpdatedDenom(*subgraph_to_merge_into, *subgraph_to_delete, edge); + if (edge.filter_info->join_type == JoinType::LEFT) { + auto denom = + edge.has_tdom_hll ? static_cast(edge.tdom_hll) : static_cast(edge.tdom_no_hll); + D_ASSERT(denom >= 1); + subgraph_to_merge_into->numerator_relations_extra = 1 + LEFT_JOIN_COEFFICIENT * (denom - 1); + } subgraph_to_delete->relations = nullptr; auto remove_start = std::remove_if(subgraphs.begin(), subgraphs.end(), [](Subgraph2Denominator &s) { return !s.relations; }); @@ -366,6 +382,7 @@ DenomInfo CardinalityEstimator::GetDenominator(JoinRelationSet &set) { D_ASSERT(final_subgraph.numerator_relations && merge_with->numerator_relations); final_subgraph.numerator_relations = &set_manager.Union(*final_subgraph.numerator_relations, *merge_with->numerator_relations); + final_subgraph.numerator_relations_extra *= merge_with->numerator_relations_extra; final_subgraph.denom *= merge_with->denom; } } @@ -374,7 +391,9 @@ DenomInfo CardinalityEstimator::GetDenominator(JoinRelationSet &set) { // denominator is 1 and numerators are a cross product of cardinalities. return DenomInfo(set, 1, 1); } - return DenomInfo(*subgraphs.at(0).numerator_relations, 1, subgraphs.at(0).denom * denom_multiplier); + // auto filter_str + return DenomInfo(*subgraphs.at(0).numerator_relations, subgraphs.at(0).numerator_relations_extra, + subgraphs.at(0).denom * denom_multiplier); } template <> @@ -387,6 +406,7 @@ double CardinalityEstimator::EstimateCardinalityWithSet(JoinRelationSet &new_set // can happen if a table has cardinality 0, or a tdom is set to 0 auto denom = GetDenominator(new_set); auto numerator = GetNumerator(denom.numerator_relations); + numerator *= denom.extra_multiplier; double result = numerator / denom.denominator; auto new_entry = CardinalityHelper(result); diff --git a/src/optimizer/join_order/cost_model.cpp b/src/optimizer/join_order/cost_model.cpp index bfe64412f053..680e19293539 100644 --- a/src/optimizer/join_order/cost_model.cpp +++ b/src/optimizer/join_order/cost_model.cpp @@ -1,3 +1,4 @@ + #include "duckdb/optimizer/join_order/join_node.hpp" #include "duckdb/optimizer/join_order/join_order_optimizer.hpp" #include "duckdb/optimizer/join_order/cost_model.hpp" @@ -8,7 +9,11 @@ CostModel::CostModel(QueryGraphManager &query_graph_manager) : query_graph_manager(query_graph_manager), cardinality_estimator() { } -double CostModel::ComputeCost(DPJoinNode &left, DPJoinNode &right) { +double CostModel::ComputeCost(DPJoinNode &left, DPJoinNode &right, NeighborInfo &neighbor_info) { + return ComputeJoinCost(left, right); +} + +double CostModel::ComputeJoinCost(DPJoinNode &left, DPJoinNode &right) { auto &combination = query_graph_manager.set_manager.Union(left.set, right.set); auto join_card = cardinality_estimator.EstimateCardinalityWithSet(combination); auto join_cost = join_card; diff --git a/src/optimizer/join_order/join_order_optimizer.cpp b/src/optimizer/join_order/join_order_optimizer.cpp index e49798ac4bff..a84d4eadd3c1 100644 --- a/src/optimizer/join_order/join_order_optimizer.cpp +++ b/src/optimizer/join_order/join_order_optimizer.cpp @@ -7,6 +7,7 @@ #include "duckdb/optimizer/join_order/plan_enumerator.hpp" #include "duckdb/planner/expression/list.hpp" #include "duckdb/planner/operator/list.hpp" +#include "duckdb/optimizer/column_binding_replacer.hpp" namespace duckdb { @@ -20,16 +21,81 @@ JoinOrderOptimizer JoinOrderOptimizer::CreateChildOptimizer() { return child_optimizer; } +unique_ptr RemoveUnnecessaryProjections::RemoveProjectionsChildren(unique_ptr plan) { + for (idx_t i = 0; i < plan->children.size(); i++) { + plan->children[i] = RemoveProjections(std::move(plan->children[i])); + } + return plan; +} +unique_ptr RemoveUnnecessaryProjections::RemoveProjections(unique_ptr plan) { + if (plan->type == LogicalOperatorType::LOGICAL_UNION || plan->type == LogicalOperatorType::LOGICAL_EXCEPT || + plan->type == LogicalOperatorType::LOGICAL_INTERSECT || + plan->type == LogicalOperatorType::LOGICAL_RECURSIVE_CTE || + plan->type == LogicalOperatorType::LOGICAL_MATERIALIZED_CTE) { + // guaranteed to find a projection under this that is meant to keep the column order in the presence of + // an optimization done by build side probe side. + for (idx_t i = 0; i < plan->children.size(); i++) { + first_projection = true; + plan->children[i] = RemoveProjections(std::move(plan->children[i])); + } + return plan; + } + if (plan->type != LogicalOperatorType::LOGICAL_PROJECTION) { + return RemoveProjectionsChildren(std::move(plan)); + } + // operator is a projection. Remove if possible + if (first_projection) { + first_projection = false; + return RemoveProjectionsChildren(std::move(plan)); + } + auto &proj = plan->Cast(); + auto child_bindings = plan->children[0]->GetColumnBindings(); + if (proj.GetColumnBindings().size() != child_bindings.size()) { + return plan; + } + idx_t binding_index = 0; + for (auto &expr : proj.expressions) { + if (expr->type != ExpressionType::BOUND_COLUMN_REF) { + return plan; + } + auto &bound_ref = expr->Cast(); + if (bound_ref.binding != child_bindings[binding_index]) { + return plan; + } + binding_index++; + } + D_ASSERT(binding_index == plan->GetColumnBindings().size()); + // we have a projection where every expression is a bound column ref, and they are in the same order as the + // bindings of the child. We can remove this projection + binding_index = 0; + for (auto &binding : plan->GetColumnBindings()) { + replacer.replacement_bindings.push_back(ReplacementBinding(binding, child_bindings[binding_index])); + binding_index++; + } + return RemoveProjectionsChildren(std::move(plan->children[0])); +} + +RemoveUnnecessaryProjections::RemoveUnnecessaryProjections() { + first_projection = true; +} + unique_ptr JoinOrderOptimizer::Optimize(unique_ptr plan, - optional_ptr stats) { + optional_ptr stats, bool remove_projections) { // make sure query graph manager has not extracted a relation graph already + if (remove_projections) { + RemoveUnnecessaryProjections remover; + plan = remover.RemoveProjections(std::move(plan)); + remover.replacer.VisitOperator(*plan); + + auto bindings = plan->GetColumnBindings(); + } + LogicalOperator *op = plan.get(); // extract the relations that go into the hyper graph. // We optimize the children of any non-reorderable operations we come across. bool reorderable = query_graph_manager.Build(*this, *op); - // get relation_stats here since the reconstruction process will move all relations. auto relation_stats = query_graph_manager.relation_manager.GetRelationStats(); unique_ptr new_logical_plan = nullptr; diff --git a/src/optimizer/join_order/join_relation_set.cpp b/src/optimizer/join_order/join_relation_set.cpp index aa5767427ae7..11f4a3a708af 100644 --- a/src/optimizer/join_order/join_relation_set.cpp +++ b/src/optimizer/join_order/join_relation_set.cpp @@ -66,6 +66,14 @@ JoinRelationSet &JoinRelationSetManager::GetJoinRelation(idx_t index) { return GetJoinRelation(std::move(relations), count); } +JoinRelationSet &JoinRelationSetManager::GetEmptyJoinRelationSet() { + if (!empty_relation_set) { + const unordered_set empty_bindings = {}; + empty_relation_set = GetJoinRelation(empty_bindings); + } + return *empty_relation_set.get(); +} + JoinRelationSet &JoinRelationSetManager::GetJoinRelation(const unordered_set &bindings) { // create a sorted vector of the relations unsafe_unique_array relations = bindings.empty() ? nullptr : make_unsafe_uniq_array(bindings.size()); diff --git a/src/optimizer/join_order/plan_enumerator.cpp b/src/optimizer/join_order/plan_enumerator.cpp index 04b396b97800..19d715e1ede9 100644 --- a/src/optimizer/join_order/plan_enumerator.cpp +++ b/src/optimizer/join_order/plan_enumerator.cpp @@ -101,7 +101,6 @@ const reference_map_t> &PlanEnumerator:: unique_ptr PlanEnumerator::CreateJoinTree(JoinRelationSet &set, const vector> &possible_connections, DPJoinNode &left, DPJoinNode &right) { - // FIXME: should consider different join algorithms, should we pick a join algorithm here as well? (probably) optional_ptr best_connection = possible_connections.back().get(); // cross products are technically still connections, but the filter expression is a null_ptr @@ -120,7 +119,7 @@ unique_ptr PlanEnumerator::CreateJoinTree(JoinRelationSet &set, } auto join_type = JoinType::INVALID; for (auto &filter_binding : best_connection->filters) { - if (!filter_binding->left_set || !filter_binding->right_set) { + if (!filter_binding->left_relation_set || !filter_binding->right_relation_set) { continue; } @@ -132,7 +131,7 @@ unique_ptr PlanEnumerator::CreateJoinTree(JoinRelationSet &set, } } // need the filter info from the Neighborhood info. - auto cost = cost_model.ComputeCost(left, right); + auto cost = cost_model.ComputeCost(left, right, *best_connection); auto result = make_uniq(set, best_connection, left.set, right.set, cost); result->cardinality = cost_model.cardinality_estimator.EstimateCardinalityWithSet(set); return result; diff --git a/src/optimizer/join_order/query_graph_manager.cpp b/src/optimizer/join_order/query_graph_manager.cpp index 6850508a6997..e061e0326201 100644 --- a/src/optimizer/join_order/query_graph_manager.cpp +++ b/src/optimizer/join_order/query_graph_manager.cpp @@ -29,41 +29,16 @@ bool QueryGraphManager::Build(JoinOrderOptimizer &optimizer, LogicalOperator &op return false; } // extract the edges of the hypergraph, creating a list of filters and their associated bindings. - filters_and_bindings = relation_manager.ExtractEdges(op, filter_operators, set_manager); + filters_and_bindings = relation_manager.ExtractEdges(filter_operators, set_manager); // Create the query_graph hyper edges CreateHyperGraphEdges(); return true; } -void QueryGraphManager::GetColumnBinding(Expression &expression, ColumnBinding &binding) { - if (expression.GetExpressionType() == ExpressionType::BOUND_COLUMN_REF) { - // Here you have a filter on a single column in a table. Return a binding for the column - // being filtered on so the filter estimator knows what HLL count to pull - auto &colref = expression.Cast(); - D_ASSERT(colref.depth == 0); - D_ASSERT(colref.binding.table_index != DConstants::INVALID_INDEX); - // map the base table index to the relation index used by the JoinOrderOptimizer - D_ASSERT(relation_manager.relation_mapping.find(colref.binding.table_index) != - relation_manager.relation_mapping.end()); - binding = - ColumnBinding(relation_manager.relation_mapping[colref.binding.table_index], colref.binding.column_index); - } - // TODO: handle inequality filters with functions. - ExpressionIterator::EnumerateChildren(expression, [&](Expression &expr) { GetColumnBinding(expr, binding); }); -} - const vector> &QueryGraphManager::GetFilterBindings() const { return filters_and_bindings; } -void FilterInfo::SetLeftSet(optional_ptr left_set_new) { - left_set = left_set_new; -} - -void FilterInfo::SetRightSet(optional_ptr right_set_new) { - right_set = right_set_new; -} - static unique_ptr PushFilter(unique_ptr node, unique_ptr expr) { // push an expression into a filter // first check if we have any filter to push it into @@ -83,76 +58,12 @@ static unique_ptr PushFilter(unique_ptr node, void QueryGraphManager::CreateHyperGraphEdges() { // create potential edges from the comparisons for (auto &filter_info : filters_and_bindings) { - auto &filter = filter_info->filter; - // now check if it can be used as a join predicate - if (filter->GetExpressionClass() == ExpressionClass::BOUND_COMPARISON) { - auto &comparison = filter->Cast(); - // extract the bindings that are required for the left and right side of the comparison - unordered_set left_bindings, right_bindings; - relation_manager.ExtractBindings(*comparison.left, left_bindings); - relation_manager.ExtractBindings(*comparison.right, right_bindings); - GetColumnBinding(*comparison.left, filter_info->left_binding); - GetColumnBinding(*comparison.right, filter_info->right_binding); - if (!left_bindings.empty() && !right_bindings.empty()) { - // both the left and the right side have bindings - // first create the relation sets, if they do not exist - if (!filter_info->left_set) { - filter_info->left_set = &set_manager.GetJoinRelation(left_bindings); - } - if (!filter_info->right_set) { - filter_info->right_set = &set_manager.GetJoinRelation(right_bindings); - } - // we can only create a meaningful edge if the sets are not exactly the same - if (filter_info->left_set != filter_info->right_set) { - // check if the sets are disjoint - if (Disjoint(left_bindings, right_bindings)) { - // they are disjoint, we only need to create one set of edges in the join graph - query_graph.CreateEdge(*filter_info->left_set, *filter_info->right_set, filter_info); - query_graph.CreateEdge(*filter_info->right_set, *filter_info->left_set, filter_info); - } - } - } - } else if (filter->GetExpressionClass() == ExpressionClass::BOUND_CONJUNCTION) { - auto &conjunction = filter->Cast(); - if (conjunction.GetExpressionType() == ExpressionType::CONJUNCTION_OR || - filter_info->join_type == JoinType::INNER || filter_info->join_type == JoinType::INVALID) { - // Currently we do not interpret Conjunction expressions as INNER joins - // for hyper graph edges. These are most likely OR conjunctions, and - // will be pushed down into a join later in the optimizer. - // Conjunction filters are mostly to help plan semi and anti joins at the moment. - continue; - } - unordered_set left_bindings, right_bindings; - D_ASSERT(filter_info->left_set); - D_ASSERT(filter_info->right_set); - D_ASSERT(filter_info->join_type == JoinType::SEMI || filter_info->join_type == JoinType::ANTI); - for (auto &child_comp : conjunction.children) { - if (child_comp->GetExpressionClass() != ExpressionClass::BOUND_COMPARISON) { - continue; - } - auto &comparison = child_comp->Cast(); - // extract the bindings that are required for the left and right side of the comparison - relation_manager.ExtractBindings(*comparison.left, left_bindings); - relation_manager.ExtractBindings(*comparison.right, right_bindings); - if (filter_info->left_binding.table_index == DConstants::INVALID_INDEX && - filter_info->left_binding.column_index == DConstants::INVALID_INDEX) { - GetColumnBinding(*comparison.left, filter_info->left_binding); - } - if (filter_info->right_binding.table_index == DConstants::INVALID_INDEX && - filter_info->right_binding.column_index == DConstants::INVALID_INDEX) { - GetColumnBinding(*comparison.right, filter_info->right_binding); - } - } - if (!left_bindings.empty() && !right_bindings.empty()) { - // we can only create a meaningful edge if the sets are not exactly the same - if (filter_info->left_set != filter_info->right_set) { - // check if the sets are disjoint - if (Disjoint(left_bindings, right_bindings)) { - // they are disjoint, we only need to create one set of edges in the join graph - query_graph.CreateEdge(*filter_info->left_set, *filter_info->right_set, filter_info); - query_graph.CreateEdge(*filter_info->right_set, *filter_info->left_set, filter_info); - } - } + if (!filter_info->left_relation_set->Empty() && !filter_info->right_relation_set->Empty()) { + // we can only create a meaningful edge if the sets are not exactly the same + if (filter_info->left_relation_set != filter_info->right_relation_set) { + // they are disjoint, we only need to create one set of edges in the join graph + query_graph.CreateEdge(*filter_info->left_relation_set, *filter_info->right_relation_set, filter_info); + query_graph.CreateEdge(*filter_info->right_relation_set, *filter_info->left_relation_set, filter_info); } } } @@ -234,6 +145,22 @@ static JoinCondition MaybeInvertConditions(unique_ptr condition, boo return cond; } +void GetColumnBindingsFromExpression(Expression &expression, column_binding_set_t &column_bindings) { + if (expression.GetExpressionType() == ExpressionType::BOUND_COLUMN_REF) { + // Here you have a filter on a single column in a table. Return a binding for the column + // being filtered on so the filter estimator knows what HLL count to pull + auto &colref = expression.Cast(); + D_ASSERT(colref.depth == 0); + D_ASSERT(colref.binding.table_index != DConstants::INVALID_INDEX); + // only add column bindings that map to relations. + // map the base table index to the relation index used by the JoinOrderOptimizer + column_bindings.insert(ColumnBinding(colref.binding.table_index, colref.binding.column_index)); + } + // TODO: handle inequality filters with functions. + ExpressionIterator::EnumerateChildren( + expression, [&](Expression &expr) { GetColumnBindingsFromExpression(expr, column_bindings); }); +} + GenerateJoinRelation QueryGraphManager::GenerateJoins(vector> &extracted_relations, JoinRelationSet &set) { optional_ptr left_node; @@ -267,8 +194,6 @@ GenerateJoinRelation QueryGraphManager::GenerateJoins(vector(chosen_filter->join_type); - // Here we optimize build side probe side. Our build side is the right side - // So the right plans should have lower cardinalities. join->children.push_back(std::move(left.op)); join->children.push_back(std::move(right.op)); @@ -280,15 +205,20 @@ GenerateJoinRelation QueryGraphManager::GenerateJoins(vectorfilter_index); auto condition = std::move(filter_and_binding->filter); // now create the actual join condition - D_ASSERT((JoinRelationSet::IsSubset(*left.set, *f->left_set) && - JoinRelationSet::IsSubset(*right.set, *f->right_set)) || - (JoinRelationSet::IsSubset(*left.set, *f->right_set) && - JoinRelationSet::IsSubset(*right.set, *f->left_set))); + D_ASSERT((JoinRelationSet::IsSubset(*left.set, *f->left_relation_set) && + JoinRelationSet::IsSubset(*right.set, *f->right_relation_set)) || + (JoinRelationSet::IsSubset(*left.set, *f->right_relation_set) && + JoinRelationSet::IsSubset(*right.set, *f->left_relation_set))); + + auto left_bindings = join->children[0]->GetColumnBindings(); + auto right_bindings = join->children[1]->GetColumnBindings(); + bool invert = !JoinRelationSet::IsSubset(*left.set, *f->left_relation_set); - bool invert = !JoinRelationSet::IsSubset(*left.set, *f->left_set); - // If the left and right set are inverted AND it is a semi or anti join - // swap left and right children back. - if (invert && (f->join_type == JoinType::SEMI || f->join_type == JoinType::ANTI)) { + // If the left and right set are inverted for LEFT/SEMI/ANTI joins then swap them back + // and set invert = false. This is to preserve left/rightedness of relations + if (invert && (f->join_type == JoinType::LEFT || f->join_type == JoinType::SEMI || + f->join_type == JoinType::ANTI)) { + std::swap(join->children[0], join->children[1]); std::swap(left, right); invert = false; } @@ -333,13 +263,18 @@ GenerateJoinRelation QueryGraphManager::GenerateJoins(vectorfilter) { // now check if the filter is a subset of the current relation // note that infos with an empty relation set are a special case and we do not push them down - if (info.set.get().count > 0 && JoinRelationSet::IsSubset(*result_relation, info.set)) { + if (info.join_type == JoinType::LEFT) { + // any left join is most definitely a filter that joins two relations, so do not push the filter + // preemptively here + continue; + } + if (info.set->count > 0 && JoinRelationSet::IsSubset(*result_relation, *info.set)) { auto &filter_and_binding = filters_and_bindings[info.filter_index]; auto filter = std::move(filter_and_binding->filter); // if it is, we can push the filter // we can push it either into a join or as a filter // check if we are in a join or in a base table - if (!left_node || !info.left_set) { + if (!left_node || !info.left_relation_set) { // base table or non-comparison expression, push it as a filter result_operator = PushFilter(std::move(result_operator), std::move(filter)); continue; @@ -348,11 +283,11 @@ GenerateJoinRelation QueryGraphManager::GenerateJoins(vector + namespace duckdb { const vector RelationManager::GetRelationStats() { @@ -28,6 +31,18 @@ idx_t RelationManager::NumRelations() { return relations.size(); } +void FilterInfo::SetLeftSet(optional_ptr left_set_new) { + left_relation_set = left_set_new; +} + +void FilterInfo::SetRightSet(optional_ptr right_set_new) { + right_relation_set = right_set_new; +} + +bool FilterInfo::SingleColumnFilter() { + return left_relation_set->Empty() || right_relation_set->Empty(); +} + void RelationManager::AddAggregateOrWindowRelation(LogicalOperator &op, optional_ptr parent, const RelationStats &stats, LogicalOperatorType op_type) { auto relation = make_uniq(op, parent, stats); @@ -65,19 +80,13 @@ void RelationManager::AddRelation(LogicalOperator &op, optional_ptrchildren.size() == 1) { - if (OperatorNeedsRelation(tmp->type) || OperatorIsNonReorderable(tmp->type)) { + if (OperatorNeedsRelation(*tmp) || OperatorIsNonReorderable(tmp->type)) { return true; } tmp = tmp->children[0].get(); @@ -192,7 +202,7 @@ bool RelationManager::ExtractJoinRelations(JoinOrderOptimizer &optimizer, Logica vector> datasource_filters; optional_ptr limit_op = nullptr; // pass through single child operators - while (op->children.size() == 1 && !OperatorNeedsRelation(op->type)) { + while (op->children.size() == 1 && !OperatorNeedsRelation(*op)) { if (op->type == LogicalOperatorType::LOGICAL_FILTER) { if (HasNonReorderableChild(*op)) { datasource_filters.push_back(*op); @@ -298,24 +308,25 @@ bool RelationManager::ExtractJoinRelations(JoinOrderOptimizer &optimizer, Logica // Adding relations of the left side to the current join order optimizer bool can_reorder_left = ExtractJoinRelations(optimizer, *op->children[0], filter_operators, op); bool can_reorder_right = true; - // For semi & anti joins, you only reorder relations in the left side of the join. - // We do not want to reorder a relation A into the right side because then all column bindings A from A will be - // lost after the semi or anti join - + // For semi/anti/left joins, you can only reorder relations through the left side of the join + // SEMI/ANTI: If we reorder a relation A into the right side then all column bindings from A + // will be lost after the semi or anti join // We cannot reorder a relation B out of the right side because any filter/join in the right side - // between a relation B and another RHS relation will be invalid. The semi join will remove - // all right column bindings, + // between a relation B and another RHS relation will be invalid. + // LEFT JOINS: If you push a relation A into the RHS of a left join, all LHS tuples of the left join + // are propageted up + // If you pull a relation A out of the RHS of a left join, you filter the LHS of the left join too strictly. // So we treat the right side of left join as its own relation so no relations // are pushed into the right side, or taken out of the right side. - if (join.join_type == JoinType::SEMI || join.join_type == JoinType::ANTI) { + if (join.join_type == JoinType::SEMI || join.join_type == JoinType::ANTI || join.join_type == JoinType::LEFT) { RelationStats child_stats; // optimize the child and copy the stats auto child_optimizer = optimizer.CreateChildOptimizer(); op->children[1] = child_optimizer.Optimize(std::move(op->children[1]), &child_stats); AddRelation(*op->children[1], op, child_stats); // remember that if a cross product needs to be forced, it cannot be forced - // across the children of a semi or anti join + // across the children of a semi/anti/left join no_cross_product_relations.insert(relations.size() - 1); auto right_child_bindings = op->children[1]->GetColumnBindings(); for (auto &bindings : right_child_bindings) { @@ -445,7 +456,38 @@ bool RelationManager::ExtractJoinRelations(JoinOrderOptimizer &optimizer, Logica } } -bool RelationManager::ExtractBindings(Expression &expression, unordered_set &bindings) { +void RelationManager::GetColumnBindingsFromExpression(Expression &expression, column_binding_set_t &column_bindings) { + if (expression.GetExpressionType() == ExpressionType::BOUND_COLUMN_REF) { + // Here you have a filter on a single column in a table. Return a binding for the column + // being filtered on so the filter estimator knows what HLL count to pull + auto &colref = expression.Cast(); + D_ASSERT(colref.depth == 0); + D_ASSERT(colref.binding.table_index != DConstants::INVALID_INDEX); + // only add column bindings that map to relations. + if (relation_mapping.find(colref.binding.table_index) == relation_mapping.end()) { + return; + } + // map the base table index to the relation index used by the JoinOrderOptimizer + column_bindings.insert( + ColumnBinding(relation_mapping[colref.binding.table_index], colref.binding.column_index)); + } + + // TODO: handle inequality filters with functions. + ExpressionIterator::EnumerateChildren( + expression, [&](Expression &expr) { GetColumnBindingsFromExpression(expr, column_bindings); }); +} + +optional_ptr RelationManager::GetJoinRelations(column_binding_set_t &column_bindings, + JoinRelationSetManager &set_manager) { + optional_ptr ret = set_manager.GetEmptyJoinRelationSet(); + for (auto &binding : column_bindings) { + optional_ptr binding_set = set_manager.GetJoinRelation(binding.table_index); + ret = set_manager.Union(*ret, *binding_set); + } + return *ret; +} + +void RelationManager::ExtractColumnBindingsFromExpression(Expression &expression, unordered_set &bindings) { if (expression.GetExpressionType() == ExpressionType::BOUND_COLUMN_REF) { auto &colref = expression.Cast(); D_ASSERT(colref.depth == 0); @@ -457,7 +499,7 @@ bool RelationManager::ExtractBindings(Expression &expression, unordered_set> +RelationManager::CreateFilterFromConjunctionChildren(unique_ptr conjunction_expression, + JoinRelationSetManager &set_manager, JoinType join_type) { + column_binding_set_t left_bindings, right_bindings; + vector> leftover_expressions; + unique_ptr filter_info = nullptr; + // gather all relations/bindings from left expressions and all relations/bindings from the right expression sides + // this encapsulates filters like (t1.a = t2.b OR t1.c = t2.d) to be a join condition + // also LEFT/SEMI/ANTI joins with multiple conditions. + for (auto &bound_expr : conjunction_expression->children) { + if (bound_expr->GetExpressionClass() == ExpressionClass::BOUND_COMPARISON) { + auto &comp = bound_expr->Cast(); + GetColumnBindingsFromExpression(*comp.left, left_bindings); + GetColumnBindingsFromExpression(*comp.right, right_bindings); + } else { + // if the condition is (t1.a = t2.b OR t1.c = t2.d or t1 is not null) + // then you need t1 and t2 in the whole relation set, so add t1 and t2 to both side. + // TODO: no you don't. But it does need to be in the left or the right. + GetColumnBindingsFromExpression(*bound_expr, left_bindings); + GetColumnBindingsFromExpression(*bound_expr, right_bindings); + } + } + if (left_bindings.empty() && right_bindings.empty()) { + // the conjunction filter cannot be made into a connection + // in this case we do not create a FilterInfo for it, it will be pushed down the plan + // during plan reconstruction. (duckdb-internal/#1493)s + leftover_expressions.push_back(std::move(conjunction_expression)); + return leftover_expressions; + } + auto left_relations = GetJoinRelations(left_bindings, set_manager); + auto right_relations = GetJoinRelations(right_bindings, set_manager); + optional_ptr all_relations = set_manager.Union(*left_relations, *right_relations); + D_ASSERT(left_relations && right_relations && all_relations && conjunction_expression); + if (left_relations->Empty() || right_relations->Empty()) { + filter_info = make_uniq(std::move(conjunction_expression), all_relations.get(), + filter_infos_.size(), join_type, *left_relations, *right_relations); + } else { + filter_info = make_uniq(std::move(conjunction_expression), all_relations.get(), + filter_infos_.size(), join_type, *left_relations, *right_relations, + *left_bindings.begin(), *right_bindings.begin()); + } + filter_infos_.push_back(std::move(filter_info)); + return leftover_expressions; +} + +vector> RelationManager::CreateFilterInfoFromExpression(unique_ptr expr, + JoinRelationSetManager &set_manager, + JoinType join_type) { + // Given a filter expression operator, check the following + // if join_type == JoinType::LEFT, JoinType::ANTI, JoinType::SEMI, + // -> treat expression as conjunction OR so and conditions don't get split up. + // if conjunction AND - > recurse on each child + // if conjunction OR -> extract bindings from left and right sides and create FilterInfo + // if comparison expression -> extract relations from left and right. If both sides have a RelationSet, create + // filter info, otherwise create a "leftover_expression" else -> create a "leftover expression" + vector> leftover_expressions; + unique_ptr new_filter = nullptr; + column_binding_set_t left_bindings, right_bindings; + optional_ptr left_set = nullptr; + optional_ptr right_set = nullptr; + optional_ptr set = nullptr; + unique_ptr new_expression = nullptr; + switch (join_type) { + case JoinType::SEMI: + case JoinType::ANTI: + case JoinType::LEFT: { + // todo handle a case like select * from a join b on a.col1 = a.col2 (i.e no conditions no table b) + // for SEMI ANTI AND LEFT, you want to keep the expressions together + D_ASSERT(expr->expression_class == ExpressionClass::BOUND_CONJUNCTION); + if (expr->expression_class == ExpressionClass::BOUND_CONJUNCTION) { + auto conj = unique_ptr_cast(std::move(expr)); + auto unused_expressions = CreateFilterFromConjunctionChildren(std::move(conj), set_manager, join_type); + // there should not be any unused expressions here. + D_ASSERT(unused_expressions.empty()); + D_ASSERT(!filter_infos_.empty()); + // We can guarantee there is a filterr info since the filter is created from a semi anti condition. + auto &new_filter = *filter_infos_.back(); + left_set = new_filter.left_relation_set; + right_set = new_filter.right_relation_set; + set = set_manager.Union(*left_set, *right_set); + } else { + throw InternalException("left/semi/anti join should have conjunction expression"); + } + + if (join_type == JoinType::LEFT) { + // When we extract relations from the left join, all filters already extracted (i.e above the left + // join) must be checked for the following condition If the filter includes relations from the RHS + // of the LEFT JOIN, then all LHS relations of the LEFT JOIN are required to be present before the + // filter can take place. This means the left join will be planned before the filter. + for (auto &filter : filter_infos_) { + if (filter->join_type == JoinType::LEFT) { + // don't inspect the filter we just created. + continue; + } + // if any filter filters on just the right set, + // if there is no right set, it is a single column filter? + if (JoinRelationSet::IsSubset(*filter->set, *right_set)) { + // TODO: I don't think changing the set does much, you need to change the left and right set and + // the conditions of what is required. + // make sure it requires all relations from the left set. + // if the filter is a (T1.a = 9) where t1.a is in the RHS of the left join + // then the filter set needs the left relations of the left join filter + // if the filter is a (T1.a = T2.b) where T1.a is the RHS of the left join and t2.b is the + // LHS, then we are fine. Union the two sets because if you have a join plan like so ((A + // LEFT JOIN B) JOIN C) with condition B.x = C.y the upper inner join has the total set (A, + // B, C). + filter->set = set_manager.Union(*filter->set, *set); + } + if (JoinRelationSet::IsSubset(*filter->left_relation_set, *right_set)) { + filter->left_relation_set = set_manager.Union(*filter->left_relation_set, *set); + } + if (JoinRelationSet::IsSubset(*filter->right_relation_set, *right_set)) { + filter->right_relation_set = set_manager.Union(*filter->right_relation_set, *set); + } + } + } + // there really should be no leftover expressions + D_ASSERT(leftover_expressions.empty()); + break; + } + // for inner joins, the filter condition can come from regular filter operation + // so we want to extract each expressions individually (if it's a conjunction and) + case JoinType::INNER: { + if (expr->expression_class == ExpressionClass::BOUND_CONJUNCTION) { + auto conj = unique_ptr_cast(std::move(expr)); + if (conj->type == ExpressionType::CONJUNCTION_AND) { + // recurse into conjunction children and try to make join filter from each child. + for (idx_t i = 0; i < conj->children.size(); i++) { + auto child = std::move(conj->children[i]); + auto unused_expressions = CreateFilterInfoFromExpression(std::move(child), set_manager, join_type); + for (idx_t j = 0; j < unused_expressions.size(); j++) { + auto unused_expression = std::move(unused_expressions[j]); + if (unused_expression) { + leftover_expressions.push_back(std::move(unused_expression)); + } + } + } + } else { + auto unused_expressions = CreateFilterFromConjunctionChildren(std::move(conj), set_manager, join_type); + for (idx_t j = 0; j < unused_expressions.size(); j++) { + auto unused_expression = std::move(unused_expressions[j]); + if (unused_expression) { + leftover_expressions.push_back(std::move(unused_expression)); + } + } + return leftover_expressions; + } + } else if (expr->expression_class == ExpressionClass::BOUND_COMPARISON) { + auto &comp = expr->Cast(); + auto new_comp = + make_uniq(comp.type, std::move(comp.left), std::move(comp.right)); + GetColumnBindingsFromExpression(*new_comp->left, left_bindings); + GetColumnBindingsFromExpression(*new_comp->right, right_bindings); + left_set = GetJoinRelations(left_bindings, set_manager); + right_set = GetJoinRelations(right_bindings, set_manager); + set = set_manager.Union(*left_set, *right_set); + new_expression = unique_ptr_cast(std::move(new_comp)); + } else { + // filter is something like `t1.a is null` or `not t1.a` + // or t1.a IN (1, 4, 9) + GetColumnBindingsFromExpression(*expr, left_bindings); + left_set = GetJoinRelations(left_bindings, set_manager); + right_set = GetJoinRelations(right_bindings, set_manager); + set = set_manager.Union(*left_set, *right_set); + new_expression = std::move(expr); + } + break; + } + default: + throw InternalException("Unknown join type"); + } + + if (left_bindings.empty() && right_bindings.empty()) { + // the filter cannot be made into a connection + // in this case we do not create a FilterInfo for it, it will be pushed down the plan + // during plan reconstruction. (duckdb-internal/#1493)s + if (new_expression) { + leftover_expressions.push_back(std::move(new_expression)); + } + } else if (new_expression) { + D_ASSERT(new_expression); + D_ASSERT(set && left_set && right_set); + if (left_set->Empty() || right_set->Empty()) { + new_filter = make_uniq(std::move(new_expression), set.get(), filter_infos_.size(), join_type, + *left_set, *right_set); + } else { + new_filter = make_uniq(std::move(new_expression), set.get(), filter_infos_.size(), join_type, + *left_set, *right_set, *left_bindings.begin(), *right_bindings.begin()); } - }); - return can_reorder; + filter_infos_.push_back(std::move(new_filter)); + } + return leftover_expressions; } -vector> RelationManager::ExtractEdges(LogicalOperator &op, - vector> &filter_operators, +vector> RelationManager::ExtractEdges(vector> &filter_operators, JoinRelationSetManager &set_manager) { + D_ASSERT(filter_infos_.empty()); // now that we know we are going to perform join ordering we actually extract the filters, eliminating duplicate // filters in the process - vector> filters_and_bindings; expression_set_t filter_set; for (auto &filter_op : filter_operators) { auto &f_op = filter_op.get(); @@ -492,110 +721,64 @@ vector> RelationManager::ExtractEdges(LogicalOperator &op f_op.type == LogicalOperatorType::LOGICAL_ASOF_JOIN) { auto &join = f_op.Cast(); D_ASSERT(join.expressions.empty()); - if (join.join_type == JoinType::SEMI || join.join_type == JoinType::ANTI) { - - auto conjunction_expression = make_uniq(ExpressionType::CONJUNCTION_AND); - // create a conjunction expression for the semi join. - // It's possible multiple LHS relations have a condition in - // this semi join. Suppose we have ((A ⨝ B) ⋉ C). (example in test_4950.test) - // If the semi join condition has A.x = C.y AND B.x = C.z then we need to prevent a reordering - // that looks like ((A ⋉ C) ⨝ B)), since all columns from C will be lost after it joins with A, - // and the condition B.x = C.z will no longer be possible. - // if we make a conjunction expressions and populate the left set and right set with all - // the relations from the conditions in the conjunction expression, we can prevent invalid - // reordering. - for (auto &cond : join.conditions) { - auto comparison = make_uniq(cond.comparison, std::move(cond.left), - std::move(cond.right)); - conjunction_expression->children.push_back(std::move(comparison)); - } - - // create the filter info so all required LHS relations are present when reconstructing the - // join - optional_ptr left_set; - optional_ptr right_set; - optional_ptr full_set; - // here we create a left_set that unions all relations from the left side of - // every expression and a right_set that unions all relations frmo the right side of a - // every expression (although this should always be 1). - for (auto &bound_expr : conjunction_expression->children) { - D_ASSERT(bound_expr->GetExpressionClass() == ExpressionClass::BOUND_COMPARISON); - auto &comp = bound_expr->Cast(); - unordered_set right_bindings, left_bindings; - ExtractBindings(*comp.right, right_bindings); - ExtractBindings(*comp.left, left_bindings); - - if (!left_set) { - left_set = set_manager.GetJoinRelation(left_bindings); - } else { - left_set = set_manager.Union(set_manager.GetJoinRelation(left_bindings), *left_set); - } - if (!right_set) { - right_set = set_manager.GetJoinRelation(right_bindings); - } else { - right_set = set_manager.Union(set_manager.GetJoinRelation(right_bindings), *right_set); - } + switch (join.join_type) { + case JoinType::SEMI: + case JoinType::ANTI: + case JoinType::LEFT: { + // create a filter info that is a conjunction of all conditions, you cannot split up + // conditions for these join types. + unique_ptr conj_expr = + make_uniq(ExpressionType::CONJUNCTION_AND); + for (idx_t i = 0; i < join.conditions.size(); i++) { + auto &condition = join.conditions[i]; + auto expr = make_uniq(condition.comparison, std::move(condition.left), + std::move(condition.right)); + conj_expr->children.push_back(std::move(expr)); } - full_set = set_manager.Union(*left_set, *right_set); - D_ASSERT(left_set && left_set->count > 0); - D_ASSERT(right_set && right_set->count == 1); - D_ASSERT(full_set && full_set->count > 0); - - // now we push the conjunction expressions - // In QueryGraphManager::GenerateJoins we extract each condition again and create a standalone join - // condition. - auto filter_info = make_uniq(std::move(conjunction_expression), *full_set, - filters_and_bindings.size(), join.join_type); - filter_info->SetLeftSet(left_set); - filter_info->SetRightSet(right_set); - - filters_and_bindings.push_back(std::move(filter_info)); - } else { - // can extract every inner join condition individually. - for (auto &cond : join.conditions) { - auto comparison = make_uniq(cond.comparison, std::move(cond.left), - std::move(cond.right)); - if (filter_set.find(*comparison) == filter_set.end()) { - filter_set.insert(*comparison); - unordered_set bindings; - ExtractBindings(*comparison, bindings); - auto &set = set_manager.GetJoinRelation(bindings); - auto filter_info = make_uniq(std::move(comparison), set, - filters_and_bindings.size(), join.join_type); - filters_and_bindings.push_back(std::move(filter_info)); - } + auto leftover_exprs = CreateFilterInfoFromExpression(std::move(conj_expr), set_manager, join.join_type); + D_ASSERT(leftover_exprs.empty()); + break; + } + default: { + D_ASSERT(join.join_type == JoinType::INNER); + for (idx_t i = 0; i < join.conditions.size(); i++) { + auto &condition = join.conditions[i]; + auto expr = make_uniq(condition.comparison, std::move(condition.left), + std::move(condition.right)); + auto leftover_exprs = CreateFilterInfoFromExpression(std::move(expr), set_manager, join.join_type); + // since this is a join, there should not be leftover expressions + // TODO: handle a case like select * from t1 JOIN t2 on t1.a = t1.b; (i.e filter is only on t1) + D_ASSERT(leftover_exprs.empty()); } + break; + } } join.conditions.clear(); } else { + // handle filters from logical filters vector> leftover_expressions; - for (auto &expression : f_op.expressions) { - if (filter_set.find(*expression) == filter_set.end()) { - filter_set.insert(*expression); - unordered_set bindings; - ExtractBindings(*expression, bindings); - if (bindings.empty()) { - // the filter is on a column that is not in our relational map. (example: limit_rownum) - // in this case we do not create a FilterInfo for it. (duckdb-internal/#1493)s - leftover_expressions.push_back(std::move(expression)); - continue; - } - auto &set = set_manager.GetJoinRelation(bindings); - auto filter_info = make_uniq(std::move(expression), set, filters_and_bindings.size()); - filters_and_bindings.push_back(std::move(filter_info)); + for (idx_t i = 0; i < f_op.expressions.size(); i++) { + auto expression = std::move(f_op.expressions[i]); + auto l = CreateFilterInfoFromExpression(std::move(expression), set_manager, JoinType::INNER); + for (idx_t j = 0; j < l.size(); j++) { + auto expr = std::move(l[j]); + leftover_expressions.push_back(std::move(expr)); } } f_op.expressions = std::move(leftover_expressions); } } - - return filters_and_bindings; +#ifdef DEBUG // otherwise tidy thinks &filter is unused. + for (auto &filter : filter_infos_) { + D_ASSERT(!filter->set->Empty()); + } +#endif + return std::move(filter_infos_); } // LCOV_EXCL_START void RelationManager::PrintRelationStats() { -#ifdef DEBUG string to_print; for (idx_t i = 0; i < relations.size(); i++) { auto &relation = relations.at(i); @@ -610,7 +793,6 @@ void RelationManager::PrintRelationStats() { to_print += " and relation id " + to_string(i) + "\n"; Printer::Print(to_print); } -#endif } // LCOV_EXCL_STOP diff --git a/src/optimizer/optimizer.cpp b/src/optimizer/optimizer.cpp index 8ac4cdd87da8..e4659082e646 100644 --- a/src/optimizer/optimizer.cpp +++ b/src/optimizer/optimizer.cpp @@ -103,6 +103,8 @@ void Optimizer::RunBuiltInOptimizers() { case LogicalOperatorType::LOGICAL_TRANSACTION: case LogicalOperatorType::LOGICAL_PRAGMA: case LogicalOperatorType::LOGICAL_SET: + case LogicalOperatorType::LOGICAL_CREATE_TABLE: + case LogicalOperatorType::LOGICAL_INSERT: case LogicalOperatorType::LOGICAL_UPDATE_EXTENSIONS: case LogicalOperatorType::LOGICAL_CREATE_SECRET: case LogicalOperatorType::LOGICAL_EXTENSION_OPERATOR: @@ -170,7 +172,7 @@ void Optimizer::RunBuiltInOptimizers() { // this also rewrites cross products + filters into joins and performs filter pushdowns RunOptimizer(OptimizerType::JOIN_ORDER, [&]() { JoinOrderOptimizer optimizer(context); - plan = optimizer.Optimize(std::move(plan)); + plan = optimizer.Optimize(std::move(plan), nullptr, true); }); // rewrites UNNESTs in DelimJoins by moving them to the projection diff --git a/src/planner/binder/tableref/plan_subqueryref.cpp b/src/planner/binder/tableref/plan_subqueryref.cpp index 821654460ab5..3e2f2c3aef15 100644 --- a/src/planner/binder/tableref/plan_subqueryref.cpp +++ b/src/planner/binder/tableref/plan_subqueryref.cpp @@ -1,5 +1,6 @@ #include "duckdb/planner/binder.hpp" #include "duckdb/planner/tableref/bound_subqueryref.hpp" +#include "duckdb/planner/expression/bound_reference_expression.hpp" namespace duckdb { diff --git a/test/optimizer/joins/left_join_reordering/left_join_cardinality_estimation.test b/test/optimizer/joins/left_join_reordering/left_join_cardinality_estimation.test new file mode 100644 index 000000000000..75cab77da016 --- /dev/null +++ b/test/optimizer/joins/left_join_reordering/left_join_cardinality_estimation.test @@ -0,0 +1,25 @@ +# name: test/optimizer/joins/left_join_reordering/left_join_cardinality_estimation.test +# description: Cardinality Estimation Left Join Reodering. +# group: [left_join_reordering] + +statement ok +create table large as select range l_id from range(100000); + +statement ok +create table medium as select range m_id from range(10000); + +statement ok +create table small as select range s_id from range(150); + +# reorder so that large is first left joined with small, then left joined with medium +query II +explain select * from large left join medium on (l_id = m_id) left join small on (l_id = s_id); +---- +physical_plan :.*l_id = m_id.*l_id = s_id.* + + +# reorder so that large is first left joined with small, then left joined with medium +# query II +# explain select * from large left join medium on (l_id = m_id) left join small on (l_id = s_id); +# ---- +# physical_plan :.*~100000 Rows* diff --git a/test/optimizer/joins/left_join_reordering/simple_left_join_tests.test b/test/optimizer/joins/left_join_reordering/simple_left_join_tests.test new file mode 100644 index 000000000000..8a7f8cbaa527 --- /dev/null +++ b/test/optimizer/joins/left_join_reordering/simple_left_join_tests.test @@ -0,0 +1,63 @@ +# name: test/optimizer/joins/left_join_reordering/simple_left_join_tests.test +# description: Reordering left joins produces still valid results +# group: [left_join_reordering] + +statement ok +create table t1 as select range%150 a1, range%150 a2 from range(100000); + +statement ok +create table t2 as select range b1, range b2 from range(10000); + +statement ok +create table t3 as select range c1, range c2 from range(150); + +statement ok +pragma disabled_optimizers='build_side_probe_side, compressed_materialization, column_lifetime'; + +# subquery should be removed and INNER join should be pushed into the left +query II +explain select * from ( + select * from t1 left join + t2 on (a1 = b1 and a2 = b2) + where b2 is null +), +t3 where a2 = c2; +---- +physical_plan :.*PROJECTION.*PROJECTION.* + +# subquery should be removed and INNER join should be pushed into the left +query II +explain select * from ( + select * from t1 left join + t2 on (a1 = b1 and a2 = b2) + where b2 is null +), +t3 where a2 = c2; +---- +physical_plan :.*LEFT.*INNER.* + +statement ok +create or replace table t3 as select range c from range(200); + +statement ok +create or replace table t1 as select range a from range(100); + +statement ok +create or replace table t2 as select range b from range(50); + +# verify filter (b is null) above the left join stays in the plan +query II +explain select * from t1 left join t2 on a = b, t3 where b is null and b = c; +---- +physical_plan :.*FILTER.* + +# statement ok +# create or replace table t1 as select range a, range % 5 b from range(10); +# +# statement ok +# create or replace table t2 as select range c from range(10); +# +# # verify filter (b is null) above the left join stays in the plan +# statement ok +# select * from t1 left join t2 on a = b; + diff --git a/test/optimizer/joins/left_join_reordering/test_reordering_left_joins.test b/test/optimizer/joins/left_join_reordering/test_reordering_left_joins.test new file mode 100644 index 000000000000..709ae184d99c --- /dev/null +++ b/test/optimizer/joins/left_join_reordering/test_reordering_left_joins.test @@ -0,0 +1,47 @@ +# name: test/optimizer/joins/left_join_reordering/test_reordering_left_joins.test +# description: Reordering left joins produces still valid results +# group: [left_join_reordering] + +# make sure no illegal left join reordering happens + +statement ok +pragma enable_verification + +statement ok +pragma disabled_optimizers='build_side_probe_side'; + +statement ok +create table t1 as from values (0), (1), (2), (3), (NULL) t(a); + +statement ok +create table t2 as from values (0), (1) t(b); + +statement ok +create table t3 as from values (1), (2) t(c); + + +# filters (a ⟕ (b ⨝ c)) +query III +select * from t1 left join (select * from t2, t3 where b = c) on a = c order by all; +---- +0 NULL NULL +1 1 1 +2 NULL NULL +3 NULL NULL +NULL NULL NULL + +# illegal reordering of test above ((a ⟕ c) ⨝ b) +query III +select * from (from t1 left join t3 on a=c), t2 where b = c order by all; +---- +1 1 1 + +# this can be reordered +# from (a ⨝ b) ⟕ c) to ((b ⟕ c) ⨝ a) +query III +select * from t1 join t2 on a = b left join t3 on b = c order by all; +---- +0 0 NULL +1 1 1 + +# two && three left joins. \ No newline at end of file diff --git a/test/optimizer/pullup_filters.test b/test/optimizer/pullup_filters.test index 62a76de2335e..ebd00ad8ff31 100644 --- a/test/optimizer/pullup_filters.test +++ b/test/optimizer/pullup_filters.test @@ -2,17 +2,20 @@ # description: Test Filters Pull Up # group: [optimizer] +# TODO fix this test. Now that unnecessary projections are removed, +mode skip + statement ok PRAGMA explain_output = 'PHYSICAL_ONLY' statement ok -CREATE TABLE vals1 AS SELECT i AS i, i AS j FROM range(0, 11, 1) t1(i) +CREATE TABLE vals1 AS SELECT i AS i, i AS j FROM range(0, 11, 1) t1(i); statement ok -CREATE TABLE vals2(k BIGINT, l BIGINT) +CREATE TABLE vals2(k BIGINT, l BIGINT); statement ok -INSERT INTO vals2 SELECT * FROM vals1 +INSERT INTO vals2 SELECT * FROM vals1; ## INNER JOIN: pull up a single filter in cross product from LHS query II diff --git a/test/sql/subquery/test_neumann.test b/test/sql/subquery/test_neumann.test index b4190f528b72..541ce4eaa972 100644 --- a/test/sql/subquery/test_neumann.test +++ b/test/sql/subquery/test_neumann.test @@ -6,28 +6,28 @@ statement ok PRAGMA enable_verification statement ok -CREATE TABLE students(id INTEGER, name VARCHAR, major VARCHAR, year INTEGER) +CREATE TABLE students(id INTEGER, name VARCHAR, major VARCHAR, year INTEGER); statement ok -CREATE TABLE exams(sid INTEGER, course VARCHAR, curriculum VARCHAR, grade INTEGER, year INTEGER) +CREATE TABLE exams(sid INTEGER, course VARCHAR, curriculum VARCHAR, grade INTEGER, year INTEGER); statement ok -INSERT INTO students VALUES (1, 'Mark', 'CS', 2017) +INSERT INTO students VALUES (1, 'Mark', 'CS', 2017); statement ok -INSERT INTO students VALUES (2, 'Dirk', 'CS', 2017) +INSERT INTO students VALUES (2, 'Dirk', 'CS', 2017); statement ok -INSERT INTO exams VALUES (1, 'Database Systems', 'CS', 10, 2015) +INSERT INTO exams VALUES (1, 'Database Systems', 'CS', 10, 2015); statement ok -INSERT INTO exams VALUES (1, 'Graphics', 'CS', 9, 2016) +INSERT INTO exams VALUES (1, 'Graphics', 'CS', 9, 2016); statement ok -INSERT INTO exams VALUES (2, 'Database Systems', 'CS', 7, 2015) +INSERT INTO exams VALUES (2, 'Database Systems', 'CS', 7, 2015); statement ok -INSERT INTO exams VALUES (2, 'Graphics', 'CS', 7, 2016) +INSERT INTO exams VALUES (2, 'Graphics', 'CS', 7, 2016); query TTI SELECT s.name, e.course, e.grade FROM students s, exams e WHERE s.id=e.sid AND e.grade=(SELECT MAX(e2.grade) FROM exams e2 WHERE s.id=e2.sid) ORDER BY name, course;