diff --git a/src/execution/physical_plan/plan_set_operation.cpp b/src/execution/physical_plan/plan_set_operation.cpp index ef0f17554f04..3307f543b230 100644 --- a/src/execution/physical_plan/plan_set_operation.cpp +++ b/src/execution/physical_plan/plan_set_operation.cpp @@ -10,19 +10,6 @@ namespace duckdb { -static vector> CreatePartitionedRowNumExpression(const vector &types) { - vector> res; - auto expr = - make_uniq(ExpressionType::WINDOW_ROW_NUMBER, LogicalType::BIGINT, nullptr, nullptr); - expr->start = WindowBoundary::UNBOUNDED_PRECEDING; - expr->end = WindowBoundary::UNBOUNDED_FOLLOWING; - for (idx_t i = 0; i < types.size(); i++) { - expr->partitions.push_back(make_uniq(types[i], i)); - } - res.push_back(std::move(expr)); - return res; -} - static JoinCondition CreateNotDistinctComparison(const LogicalType &type, idx_t i) { JoinCondition cond; cond.left = make_uniq(type, i); @@ -43,6 +30,8 @@ unique_ptr PhysicalPlanGenerator::CreatePlan(LogicalSetOperati throw InvalidInputException("Type mismatch for SET OPERATION"); } + // can't swich logical unions to semi/anti join + // also if the operation is a INTERSECT ALL or EXCEPT ALL switch (op.type) { case LogicalOperatorType::LOGICAL_UNION: // UNION @@ -51,54 +40,8 @@ unique_ptr PhysicalPlanGenerator::CreatePlan(LogicalSetOperati break; case LogicalOperatorType::LOGICAL_EXCEPT: case LogicalOperatorType::LOGICAL_INTERSECT: { - auto &types = left->GetTypes(); - vector conditions; - // create equality condition for all columns - for (idx_t i = 0; i < types.size(); i++) { - conditions.push_back(CreateNotDistinctComparison(types[i], i)); - } - // For EXCEPT ALL / INTERSECT ALL we push a window operator with a ROW_NUMBER into the scans and join to get bag - // semantics. - if (op.setop_all) { - vector window_types = types; - window_types.push_back(LogicalType::BIGINT); - - auto window_left = make_uniq(window_types, CreatePartitionedRowNumExpression(types), - left->estimated_cardinality); - window_left->children.push_back(std::move(left)); - left = std::move(window_left); - - auto window_right = make_uniq(window_types, CreatePartitionedRowNumExpression(types), - right->estimated_cardinality); - window_right->children.push_back(std::move(right)); - right = std::move(window_right); - - // add window expression result to join condition - conditions.push_back(CreateNotDistinctComparison(LogicalType::BIGINT, types.size())); - // join (created below) now includes the row number result column - op.types.push_back(LogicalType::BIGINT); - } - - // EXCEPT is ANTI join - // INTERSECT is SEMI join - PerfectHashJoinStats join_stats; // used in inner joins only - - JoinType join_type = op.type == LogicalOperatorType::LOGICAL_EXCEPT ? JoinType::ANTI : JoinType::SEMI; - result = make_uniq(op, std::move(left), std::move(right), std::move(conditions), join_type, - op.estimated_cardinality, join_stats); - - // For EXCEPT ALL / INTERSECT ALL we need to remove the row number column again - if (op.setop_all) { - vector> projection_select_list; - for (idx_t i = 0; i < types.size(); i++) { - projection_select_list.push_back(make_uniq(types[i], i)); - } - auto projection = - make_uniq(types, std::move(projection_select_list), op.estimated_cardinality); - projection->children.push_back(std::move(result)); - result = std::move(projection); - } - break; + throw InternalException( + "Logical Except/Intersect should have been transformed to semi anti before the physical planning phase"); } default: throw InternalException("Unexpected operator type for set operation"); diff --git a/src/include/duckdb/planner/operator/logical_execute.hpp b/src/include/duckdb/planner/operator/logical_execute.hpp index 62eb28f2e7f0..0c6eff24c804 100644 --- a/src/include/duckdb/planner/operator/logical_execute.hpp +++ b/src/include/duckdb/planner/operator/logical_execute.hpp @@ -34,7 +34,7 @@ class LogicalExecute : public LogicalOperator { protected: void ResolveTypes() override { - // already resolved + types = prepared->types; } vector GetColumnBindings() override { return GenerateColumnBindings(0, types.size()); diff --git a/src/include/duckdb/planner/operator/logical_filter.hpp b/src/include/duckdb/planner/operator/logical_filter.hpp index acd5771b985f..5e969e75ded3 100644 --- a/src/include/duckdb/planner/operator/logical_filter.hpp +++ b/src/include/duckdb/planner/operator/logical_filter.hpp @@ -9,9 +9,46 @@ #pragma once #include "duckdb/planner/logical_operator.hpp" +#include "duckdb/planner/operator/logical_comparison_join.hpp" +#include "duckdb/planner/expression/bound_columnref_expression.hpp" namespace duckdb { +static bool CanFiltersPropogateRightSide(LogicalOperator &op) { + if (op.type != LogicalOperatorType::LOGICAL_COMPARISON_JOIN) { + return false; + } + auto &join = op.Cast(); + if (join.join_type != JoinType::SEMI) { + return false; + } + auto left_bindings = op.children[0]->GetColumnBindings(); + auto right_bindings = op.children[1]->GetColumnBindings(); + D_ASSERT(left_bindings.size() == right_bindings.size()); + // make sure we are comparing every column + if (join.conditions.size() != left_bindings.size()) { + return false; + } + auto &conditions = join.conditions; + for (idx_t i = 0; i < conditions.size(); i++) { + auto &cond = conditions[i]; + auto &left = cond.left; + auto &right = cond.right; + if (cond.comparison == ExpressionType::COMPARE_NOT_DISTINCT_FROM) { + if (left->type == ExpressionType::BOUND_COLUMN_REF && right->type == ExpressionType::BOUND_COLUMN_REF) { + auto &left_expr = left->Cast(); + auto &right_expr = right->Cast(); + auto left_match = left_expr.binding == left_bindings[i]; + auto right_match = right_expr.binding == right_bindings[i]; + if (!(left_match && right_match)) { + return false; + } + } + } + } + return true; +} + //! LogicalFilter represents a filter operation (e.g. WHERE or HAVING clause) class LogicalFilter : public LogicalOperator { public: diff --git a/src/main/config.cpp b/src/main/config.cpp index 7beb3948d034..7371addbba48 100644 --- a/src/main/config.cpp +++ b/src/main/config.cpp @@ -15,7 +15,7 @@ namespace duckdb { #ifdef DEBUG -bool DBConfigOptions::debug_print_bindings = false; +bool DBConfigOptions::debug_print_bindings = true; #endif #define DUCKDB_GLOBAL(_PARAM) \ diff --git a/src/optimizer/filter_pullup.cpp b/src/optimizer/filter_pullup.cpp index 04986f199570..716751d0b2ce 100644 --- a/src/optimizer/filter_pullup.cpp +++ b/src/optimizer/filter_pullup.cpp @@ -1,5 +1,6 @@ #include "duckdb/optimizer/filter_pullup.hpp" #include "duckdb/planner/operator/logical_join.hpp" +#include "duckdb/planner/operator/logical_filter.hpp" namespace duckdb { @@ -40,8 +41,13 @@ unique_ptr FilterPullup::PullupJoin(unique_ptr case JoinType::INNER: return PullupInnerJoin(std::move(op)); case JoinType::LEFT: - case JoinType::ANTI: + case JoinType::ANTI: { + return PullupFromLeft(std::move(op)); + } case JoinType::SEMI: { + if (CanFiltersPropogateRightSide(*op)) { + return PullupBothSide(std::move(op)); + } return PullupFromLeft(std::move(op)); } default: diff --git a/src/optimizer/join_order/relation_statistics_helper.cpp b/src/optimizer/join_order/relation_statistics_helper.cpp index 5f5b15cbe1f6..79ae88dd4f7d 100644 --- a/src/optimizer/join_order/relation_statistics_helper.cpp +++ b/src/optimizer/join_order/relation_statistics_helper.cpp @@ -244,7 +244,8 @@ RelationStats RelationStatisticsHelper::CombineStatsOfNonReorderableOperator(Log } ret.stats_initialized = true; ret.filter_strength = 1; - ret.table_name = child_stats[0].table_name + " joined with " + child_stats[1].table_name; + ret.table_name = + "(" + child_stats[0].table_name + LogicalOperatorToString(op.type) + child_stats[1].table_name + ")"; for (auto &stats : child_stats) { // MARK joins are nonreorderable. They won't return initialized stats // continue in this case. diff --git a/src/optimizer/pushdown/pushdown_semi_anti_join.cpp b/src/optimizer/pushdown/pushdown_semi_anti_join.cpp index c9506fe05d68..15889c6a2a5f 100644 --- a/src/optimizer/pushdown/pushdown_semi_anti_join.cpp +++ b/src/optimizer/pushdown/pushdown_semi_anti_join.cpp @@ -9,16 +9,55 @@ namespace duckdb { using Filter = FilterPushdown::Filter; +static void ReplaceBindings(vector &bindings, Filter &filter, Expression &expr, + vector &replacement_bindings) { + if (expr.type == ExpressionType::BOUND_COLUMN_REF) { + auto &colref = expr.Cast(); + D_ASSERT(colref.depth == 0); + + // rewrite the binding by looking into the bound_tables list of the subquery + idx_t binding_index = 0; + for (idx_t i = 0; i < bindings.size(); i++) { + if (bindings[i] == colref.binding) { + binding_index = i; + break; + } + } + colref.binding = replacement_bindings[binding_index]; + filter.bindings.insert(colref.binding.table_index); + return; + } + ExpressionIterator::EnumerateChildren( + expr, [&](Expression &child) { ReplaceBindings(bindings, filter, child, replacement_bindings); }); +} + unique_ptr FilterPushdown::PushdownSemiAntiJoin(unique_ptr op) { auto &join = op->Cast(); if (op->type == LogicalOperatorType::LOGICAL_DELIM_JOIN) { return FinishPushdown(std::move(op)); } - // push all current filters down the left side - op->children[0] = Rewrite(std::move(op->children[0])); - FilterPushdown right_pushdown(optimizer); - op->children[1] = right_pushdown.Rewrite(std::move(op->children[1])); + if (CanFiltersPropogateRightSide(*op)) { + auto left_bindings = op->children[0]->GetColumnBindings(); + auto right_bindings = op->children[1]->GetColumnBindings(); + FilterPushdown right_pushdown(optimizer); + for (idx_t i = 0; i < filters.size(); i++) { + // first create a copy of the filter + auto right_filter = make_uniq(); + right_filter->filter = filters[i]->filter->Copy(); + + ReplaceBindings(left_bindings, *right_filter, *right_filter->filter, right_bindings); + right_filter->ExtractBindings(); + + // move the filters into the child pushdown nodes + right_pushdown.filters.push_back(std::move(right_filter)); + } + op->children[0] = Rewrite(std::move(op->children[0])); + op->children[1] = right_pushdown.Rewrite(std::move(op->children[1])); + } else { + // push all current filters down the left side + op->children[0] = Rewrite(std::move(op->children[0])); + } bool left_empty = op->children[0]->type == LogicalOperatorType::LOGICAL_EMPTY_RESULT; bool right_empty = op->children[1]->type == LogicalOperatorType::LOGICAL_EMPTY_RESULT; diff --git a/src/planner/binder/query_node/plan_setop.cpp b/src/planner/binder/query_node/plan_setop.cpp index c313ae016981..3568d91d9c62 100644 --- a/src/planner/binder/query_node/plan_setop.cpp +++ b/src/planner/binder/query_node/plan_setop.cpp @@ -2,11 +2,32 @@ #include "duckdb/planner/expression/bound_cast_expression.hpp" #include "duckdb/planner/expression/bound_columnref_expression.hpp" #include "duckdb/planner/operator/logical_projection.hpp" +#include "duckdb/planner/operator/logical_window.hpp" +#include "duckdb/planner/expression/bound_reference_expression.hpp" +#include "duckdb/planner/expression/bound_window_expression.hpp" #include "duckdb/planner/operator/logical_set_operation.hpp" #include "duckdb/planner/query_node/bound_set_operation_node.hpp" namespace duckdb { +static unique_ptr CreateWindowWithPartitionedRowNum(idx_t window_table_index, + unique_ptr op) { + // instead create a logical projection on top of whatever to add the window expression, then + auto window = make_uniq(window_table_index); + auto row_number = + make_uniq(ExpressionType::WINDOW_ROW_NUMBER, LogicalType::BIGINT, nullptr, nullptr); + row_number->start = WindowBoundary::UNBOUNDED_PRECEDING; + row_number->end = WindowBoundary::CURRENT_ROW_ROWS; + auto bindings = op->GetColumnBindings(); + auto types = op->types; + for (idx_t i = 0; i < types.size(); i++) { + row_number->partitions.push_back(make_uniq(types[i], bindings[i])); + } + window->expressions.push_back(std::move(row_number)); + window->AddChild(std::move(op)); + return window; +} + // Optionally push a PROJECTION operator unique_ptr Binder::CastLogicalOperatorToTypes(vector &source_types, vector &target_types, @@ -116,9 +137,104 @@ unique_ptr Binder::CreatePlan(BoundSetOperationNode &node) { break; } + // here we convert the set operation to anti semi if required. Using the node.setop all we know what conversion we + // need. auto root = make_uniq(node.setop_index, node.types.size(), std::move(left_node), std::move(right_node), logical_type, node.setop_all); + root->ResolveOperatorTypes(); + + unique_ptr op; + + // if we have an intersect or except, immediately translate it to a semi or anti join. + // Unions stay as they are. + if (logical_type == LogicalOperatorType::LOGICAL_INTERSECT || logical_type == LogicalOperatorType::LOGICAL_EXCEPT) { + auto &left = root->children[0]; + auto &right = root->children[1]; + auto left_types = root->children[0]->types; + auto right_types = root->children[1]->types; + auto old_bindings = root->GetColumnBindings(); + if (node.setop_all) { + auto window_left_table_id = GenerateTableIndex(); + root->children[0] = CreateWindowWithPartitionedRowNum(window_left_table_id, std::move(root->children[0])); + + auto window_right_table_id = GenerateTableIndex(); + root->children[1] = CreateWindowWithPartitionedRowNum(window_right_table_id, std::move(root->children[1])); + root->types.push_back(LogicalType::BIGINT); + root->column_count += 1; + } + + auto left_bindings = left->GetColumnBindings(); + auto right_bindings = right->GetColumnBindings(); + D_ASSERT(left_bindings.size() == right_bindings.size()); + + vector conditions; + // create equality condition for all columns + idx_t binding_offset = node.setop_all ? 1 : 0; + for (idx_t i = 0; i < left_bindings.size() - binding_offset; i++) { + auto cond_type_left = LogicalType(LogicalType::UNKNOWN); + auto cond_type_right = LogicalType(LogicalType::UNKNOWN); + JoinCondition cond; + cond.left = make_uniq(left_types[i], left_bindings[i]); + cond.right = make_uniq(right_types[i], right_bindings[i]); + cond.comparison = ExpressionType::COMPARE_NOT_DISTINCT_FROM; + conditions.push_back(std::move(cond)); + } + + // create condition for the row number as well. + if (node.setop_all) { + JoinCondition cond; + cond.left = + make_uniq(LogicalType::BIGINT, left_bindings[left_bindings.size() - 1]); + cond.right = + make_uniq(LogicalType::BIGINT, right_bindings[right_bindings.size() - 1]); + cond.comparison = ExpressionType::COMPARE_NOT_DISTINCT_FROM; + conditions.push_back(std::move(cond)); + } + + JoinType join_type = root->type == LogicalOperatorType::LOGICAL_EXCEPT ? JoinType::ANTI : JoinType::SEMI; + + auto join_op = make_uniq(join_type); + join_op->children.push_back(std::move(left)); + join_op->children.push_back(std::move(right)); + join_op->conditions = std::move(conditions); + join_op->ResolveOperatorTypes(); + + op = std::move(join_op); + + // create projection to remove row_id. + if (node.setop_all) { + vector> projection_select_list; + auto bindings = op->GetColumnBindings(); + for (idx_t i = 0; i < bindings.size() - 1; i++) { + projection_select_list.push_back(make_uniq(op->types[i], bindings[i])); + } + auto projection = make_uniq(node.setop_index, std::move(projection_select_list)); + projection->children.push_back(std::move(op)); + op = std::move(projection); + } + + if (!node.setop_all) { + // push a distinct operator on the join + auto &types = op->types; + auto join_bindings = op->GetColumnBindings(); + vector> distinct_targets; + vector> select_list; + for (idx_t i = 0; i < join_bindings.size(); i++) { + distinct_targets.push_back(make_uniq(types[i], join_bindings[i])); + select_list.push_back(make_uniq(types[i], join_bindings[i])); + } + auto distinct = make_uniq(std::move(distinct_targets), DistinctType::DISTINCT); + distinct->children.push_back(std::move(op)); + op = std::move(distinct); + + auto projection = make_uniq(node.setop_index, std::move(select_list)); + projection->children.push_back(std::move(op)); + op = std::move(projection); + op->ResolveOperatorTypes(); + } + return VisitQueryNode(node, std::move(op)); + } return VisitQueryNode(node, std::move(root)); } diff --git a/test/optimizer/pullup_filters.test b/test/optimizer/pullup_filters.test index af914518b42c..90128260c34f 100644 --- a/test/optimizer/pullup_filters.test +++ b/test/optimizer/pullup_filters.test @@ -6,13 +6,15 @@ statement ok PRAGMA explain_output = 'PHYSICAL_ONLY' statement ok -CREATE TABLE vals1 AS SELECT i AS i, i AS j FROM range(0, 11, 1) t1(i) +CREATE TABLE vals1 AS SELECT i AS i, i AS j FROM range(0, 11, 1) t1(i); statement ok -CREATE TABLE vals2(k BIGINT, l BIGINT) +CREATE TABLE vals2(k BIGINT, l BIGINT); statement ok -INSERT INTO vals2 SELECT * FROM vals1 +INSERT INTO vals2 SELECT * FROM vals1; + +mode skip ## INNER JOIN: pull up a single filter in cross product from LHS query II @@ -74,6 +76,8 @@ EXPLAIN SELECT * FROM (SELECT * FROM vals1, vals2 WHERE i=3 AND k=5 INTERSECT SE ---- physical_plan :((.*=3.*=5.*=3.*=5.*)|(.*=5.*=3.*=5.*=3.*)) +mode unskip + ## INTERSECT: pull up filters from RHS query II EXPLAIN SELECT * FROM (SELECT * FROM vals1, vals2 INTERSECT SELECT * FROM vals1, vals2 WHERE i=3 AND k=5) tbl1; diff --git a/test/optimizer/setops/operation_converter.test b/test/optimizer/setops/operation_converter.test new file mode 100644 index 000000000000..9a799746af14 --- /dev/null +++ b/test/optimizer/setops/operation_converter.test @@ -0,0 +1,13 @@ +# name: test/optimizer/setops/operation_converter.test +# description: converting intersect/except to semi anti +# group: [setops] + +statement ok +create table left_table as select range as a from range(100); + +statement ok +create table right_table as select range*2 as b from range(10000); + +statement ok +select * from (select * from left_table intersect select * from right_table) ll1 where ll1.a < 5; + diff --git a/test/optimizer/pushdown_set_op.test b/test/optimizer/setops/pushdown_set_op.test similarity index 80% rename from test/optimizer/pushdown_set_op.test rename to test/optimizer/setops/pushdown_set_op.test index 27339db1fc92..1b98d31a690f 100644 --- a/test/optimizer/pushdown_set_op.test +++ b/test/optimizer/setops/pushdown_set_op.test @@ -1,69 +1,68 @@ -# name: test/optimizer/pushdown_set_op.test +# name: test/optimizer/setops/pushdown_set_op.test # description: Pushdown set operations -# group: [optimizer] +# group: [setops] statement ok PRAGMA explain_output = 'OPTIMIZED_ONLY' - query II explain select 42 intersect select 42; ---- -logical_opt :.*INTERSECT.* +logical_opt :.*SEMI.* # intersect is empty if either side is empty query II explain select 42 intersect select 42 where 1=0; ---- -logical_opt :.*INTERSECT.* +logical_opt :.*SEMI.* query II explain select 42 where 1=0 intersect select 42; ---- -logical_opt :.*INTERSECT.* +logical_opt :.*SEMI.* # except is empty if LHS is empty query II explain select 42 where 1=0 except select 42; ---- -logical_opt :.*EXCEPT.* +logical_opt :.*ANTI.* # if RHS is empty we can optimize away the except query II explain select 42 except select 42 where 1=0; ---- -logical_opt :.*EXCEPT.* +logical_opt :.*ANTI.* # now pushdown subquery with set ops query II explain select * from (select 42 intersect select 42) tbl(i) where i=42; ---- -logical_opt :.*INTERSECT.* +logical_opt :.*SEMI.* query II explain select * from (select 42 intersect select 43) tbl(i) where i=42; ---- -logical_opt :.*INTERSECT.* +logical_opt :.*SEMI.* query II explain select * from (select 43 intersect select 42) tbl(i) where i=42; ---- -logical_opt :.*INTERSECT.* +logical_opt :.*SEMI.* query II explain select * from (select 42 except select 42) tbl(i) where i=42; ---- -logical_opt :.*EXCEPT.* +logical_opt :.*ANTI.* query II explain select * from (select 42 except select 43) tbl(i) where i=42; ---- -logical_opt :.*EXCEPT.* +logical_opt :.*ANTI.* query II explain select * from (select 43 except select 42) tbl(i) where i=42; ---- -logical_opt :.*EXCEPT.* +logical_opt :.*ANTI.* query I select 42 intersect select 42; diff --git a/test/sql/setops/test_setops.test b/test/sql/setops/test_setops.test index e517543780aa..929816f93741 100644 --- a/test/sql/setops/test_setops.test +++ b/test/sql/setops/test_setops.test @@ -115,7 +115,6 @@ SELECT 1, 'a' UNION ALL SELECT 1, 'a' UNION SELECT 2, 'b' UNION SELECT 1, 'a' OR 2 b 1 a - # EXCEPT ALL / INTERSECT ALL query II select x, count(*) as c diff --git a/test/sql/tpcds/tpcds_sf0.test b/test/sql/tpcds/tpcds_sf0.test index 99264975a793..09f2ee526bce 100644 --- a/test/sql/tpcds/tpcds_sf0.test +++ b/test/sql/tpcds/tpcds_sf0.test @@ -18,19 +18,19 @@ endloop statement error PRAGMA tpcds(-1) ---- +Syntax Error: Out of range TPC-DS query number statement error PRAGMA tpcds(3290819023812038903) ---- +Invalid Input Error statement error PRAGMA tpcds(32908301298) ---- +Invalid Input Error statement error PRAGMA tpcds(1.1) ---- - -# queries -statement ok -SELECT * FROM tpcds_queries() +Binder Error: \ No newline at end of file