diff --git a/Cargo.toml b/Cargo.toml index d92dd1f9bed60..9901efadc4a91 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,80 +11,8 @@ members = [ "src/bendpy", "src/bendsave", "src/binaries", - "src/common/auth", - "src/common/base", - "src/common/building", - "src/common/cache", - "src/common/cloud_control", - "src/common/column", - "src/common/compress", - "src/common/exception", - "src/common/frozen_api", - "src/common/grpc", - "src/common/hashtable", - "src/common/http", - "src/common/io", - "src/common/license", - "src/common/metrics", - "src/common/native", - "src/common/storage", - "src/common/telemetry", - "src/common/timezone", - "src/common/tracing", - "src/common/vector", - "src/common/version", "src/meta/*", - "src/query/ast", - "src/query/catalog", "src/query/codegen", - "src/query/config", - "src/query/datavalues", - "src/query/ee_features/attach_table", - "src/query/ee_features/data_mask", - "src/query/ee_features/resources_management", - "src/query/ee_features/storage_encryption", - "src/query/ee_features/stream_handler", - "src/query/ee_features/vacuum_handler", - "src/query/ee_features/virtual_column", - "src/query/ee", - "src/query/expression", - "src/query/formats", - "src/query/functions", - "src/query/functions", - "src/query/functions/src/scalars/arithmetic", - "src/query/functions/src/scalars/geographic", - "src/query/functions/src/scalars/integer_arithmetic", - "src/query/functions/src/scalars/mathematics", - "src/query/functions/src/scalars/numeric_basic_arithmetic", - "src/query/functions/src/scalars/timestamp", - "src/query/management", - "src/query/pipeline", - "src/query/pipeline/transforms", - "src/query/script", - "src/query/service", - "src/query/settings", - "src/query/sql", - "src/query/storages/basic", - "src/query/storages/common/blocks", - "src/query/storages/common/cache", - "src/query/storages/common/index", - "src/query/storages/common/io", - "src/query/storages/common/pruner", - "src/query/storages/common/session", - "src/query/storages/common/stage", - "src/query/storages/common/table_meta", - "src/query/storages/delta", - "src/query/storages/factory", - "src/query/storages/fuse", - "src/query/storages/hive/hive", - "src/query/storages/iceberg", - "src/query/storages/information_schema", - "src/query/storages/orc", - "src/query/storages/parquet", - "src/query/storages/stage", - "src/query/storages/stream", - "src/query/storages/system", - "src/query/users", "src/tests/sqlsmith", "tests/sqllogictests", ] diff --git a/src/query/ast/src/parser/script.rs b/src/query/ast/src/parser/script.rs index efd706e027dc0..4506f655ded46 100644 --- a/src/query/ast/src/parser/script.rs +++ b/src/query/ast/src/parser/script.rs @@ -158,6 +158,7 @@ pub fn script_stmts(i: Input) -> IResult> { semicolon_terminated_list1(script_stmt).parse(i) } +#[recursive::recursive] pub fn script_stmt(i: Input) -> IResult { if let Some(token) = i.tokens.first() { let kind = token.kind; diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/results/basic/01_cross_join_aggregation_optimized.txt b/src/query/service/tests/it/sql/planner/optimizer/data/results/basic/01_cross_join_aggregation_optimized.txt index 3cdb2e1eb02df..df35dc2c3a36c 100644 --- a/src/query/service/tests/it/sql/planner/optimizer/data/results/basic/01_cross_join_aggregation_optimized.txt +++ b/src/query/service/tests/it/sql/planner/optimizer/data/results/basic/01_cross_join_aggregation_optimized.txt @@ -9,15 +9,15 @@ Aggregate(Final) ├── build keys: [] ├── probe keys: [] ├── other filters: [] - ├── Scan - │ ├── table: default.integers (#0) - │ ├── filters: [] - │ ├── order by: [] - │ └── limit: NONE - └── Exchange(Broadcast) - └── Scan - ├── table: default.integers (#1) - ├── filters: [] - ├── order by: [] - └── limit: NONE + ├── Exchange(Broadcast) + │ └── Scan + │ ├── table: default.integers (#1) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Scan + ├── table: default.integers (#0) + ├── filters: [] + ├── order by: [] + └── limit: NONE diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/results/basic/01_cross_join_aggregation_raw.txt b/src/query/service/tests/it/sql/planner/optimizer/data/results/basic/01_cross_join_aggregation_raw.txt index e88215acae897..0c4e9f7b88334 100644 --- a/src/query/service/tests/it/sql/planner/optimizer/data/results/basic/01_cross_join_aggregation_raw.txt +++ b/src/query/service/tests/it/sql/planner/optimizer/data/results/basic/01_cross_join_aggregation_raw.txt @@ -10,12 +10,12 @@ EvalScalar ├── probe keys: [] ├── other filters: [] ├── Scan - │ ├── table: default.integers (#0) + │ ├── table: default.integers (#1) │ ├── filters: [] │ ├── order by: [] │ └── limit: NONE └── Scan - ├── table: default.integers (#1) + ├── table: default.integers (#0) ├── filters: [] ├── order by: [] └── limit: NONE diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_avg_case_expression_optimized.txt b/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_avg_case_expression_optimized.txt index aa4ffbe4aafc5..b09f4d0f8f9c5 100644 --- a/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_avg_case_expression_optimized.txt +++ b/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_avg_case_expression_optimized.txt @@ -17,37 +17,37 @@ Exchange(Merge) ├── build keys: [a1z.a0k (#48), a1z.a0n (#50)] ├── probe keys: [a0c.a0k (#7), a0c.a0n (#10)] ├── other filters: [lte(a1z.a2c (#52), a0c.a0d (#0)), gt(a1z.a2k (#61), a0c.a0d (#0))] - ├── Join(Inner) - │ ├── build keys: [a0m (#149)] - │ ├── probe keys: [a0c.a0m (#9)] - │ ├── other filters: [] - │ ├── Join(Inner) - │ │ ├── build keys: [a5r.a5t (#151)] - │ │ ├── probe keys: [a0c.a0l (#8)] - │ │ ├── other filters: [] - │ │ ├── Scan - │ │ │ ├── table: default.a0c (#0) - │ │ │ ├── filters: [gte(a0c.a0d (#0), '20240526'), lte(a0c.a0d (#0), '20250525')] - │ │ │ ├── order by: [] - │ │ │ └── limit: NONE - │ │ └── Exchange(Broadcast) - │ │ └── Scan - │ │ ├── table: default.a5r (#3) - │ │ ├── filters: [eq(substring(a5r.a5w (#156), 1, 1), '1')] - │ │ ├── order by: [] - │ │ └── limit: NONE - │ └── Exchange(Broadcast) - │ └── EvalScalar - │ ├── scalars: [CAST(a2x.a0m (#74) AS String NULL) AS (#149)] - │ └── Scan - │ ├── table: default.a2x (#2) - │ ├── filters: [eq(substring(a2x.a4m (#118), 20, 1), '1')] - │ ├── order by: [] - │ └── limit: NONE - └── Exchange(Broadcast) - └── Scan - ├── table: default.a1z (#1) - ├── filters: [eq(a1z.a2t (#70), '624100')] - ├── order by: [] - └── limit: NONE + ├── Exchange(Broadcast) + │ └── Scan + │ ├── table: default.a1z (#1) + │ ├── filters: [eq(a1z.a2t (#70), '624100')] + │ ├── order by: [] + │ └── limit: NONE + └── Join(Inner) + ├── build keys: [a0m (#149)] + ├── probe keys: [a0c.a0m (#9)] + ├── other filters: [] + ├── Exchange(Broadcast) + │ └── EvalScalar + │ ├── scalars: [CAST(a2x.a0m (#74) AS String NULL) AS (#149)] + │ └── Scan + │ ├── table: default.a2x (#2) + │ ├── filters: [eq(substring(a2x.a4m (#118), 20, 1), '1')] + │ ├── order by: [] + │ └── limit: NONE + └── Join(Inner) + ├── build keys: [a5r.a5t (#151)] + ├── probe keys: [a0c.a0l (#8)] + ├── other filters: [] + ├── Exchange(Broadcast) + │ └── Scan + │ ├── table: default.a5r (#3) + │ ├── filters: [eq(substring(a5r.a5w (#156), 1, 1), '1')] + │ ├── order by: [] + │ └── limit: NONE + └── Scan + ├── table: default.a0c (#0) + ├── filters: [gte(a0c.a0d (#0), '20240526'), lte(a0c.a0d (#0), '20250525')] + ├── order by: [] + └── limit: NONE diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_avg_case_expression_raw.txt b/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_avg_case_expression_raw.txt index a904e77d3c021..42de32bf98548 100644 --- a/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_avg_case_expression_raw.txt +++ b/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_avg_case_expression_raw.txt @@ -13,34 +13,34 @@ EvalScalar ├── build keys: [a5r.a5t (#151)] ├── probe keys: [a0c.a0l (#8)] ├── other filters: [] - ├── Join(Left) - │ ├── build keys: [a0m (#149)] - │ ├── probe keys: [a0c.a0m (#9)] - │ ├── other filters: [] - │ ├── Join(Left) - │ │ ├── build keys: [a1z.a0k (#48), a1z.a0n (#50)] - │ │ ├── probe keys: [a0c.a0k (#7), a0c.a0n (#10)] - │ │ ├── other filters: [lte(a1z.a2c (#52), a0c.a0d (#0)), gt(a1z.a2k (#61), a0c.a0d (#0))] - │ │ ├── Scan - │ │ │ ├── table: default.a0c (#0) - │ │ │ ├── filters: [] - │ │ │ ├── order by: [] - │ │ │ └── limit: NONE - │ │ └── Scan - │ │ ├── table: default.a1z (#1) - │ │ ├── filters: [] - │ │ ├── order by: [] - │ │ └── limit: NONE - │ └── EvalScalar - │ ├── scalars: [CAST(a2x.a0m (#74) AS String NULL) AS (#149)] - │ └── Scan - │ ├── table: default.a2x (#2) - │ ├── filters: [] - │ ├── order by: [] - │ └── limit: NONE - └── Scan - ├── table: default.a5r (#3) - ├── filters: [] - ├── order by: [] - └── limit: NONE + ├── Scan + │ ├── table: default.a5r (#3) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Join(Left) + ├── build keys: [a0m (#149)] + ├── probe keys: [a0c.a0m (#9)] + ├── other filters: [] + ├── EvalScalar + │ ├── scalars: [CAST(a2x.a0m (#74) AS String NULL) AS (#149)] + │ └── Scan + │ ├── table: default.a2x (#2) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Join(Left) + ├── build keys: [a1z.a0k (#48), a1z.a0n (#50)] + ├── probe keys: [a0c.a0k (#7), a0c.a0n (#10)] + ├── other filters: [lte(a1z.a2c (#52), a0c.a0d (#0)), gt(a1z.a2k (#61), a0c.a0d (#0))] + ├── Scan + │ ├── table: default.a1z (#1) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Scan + ├── table: default.a0c (#0) + ├── filters: [] + ├── order by: [] + └── limit: NONE diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_sum_case_expression_optimized.txt b/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_sum_case_expression_optimized.txt index d9458b8309733..fe069d1956860 100644 --- a/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_sum_case_expression_optimized.txt +++ b/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_sum_case_expression_optimized.txt @@ -15,37 +15,37 @@ Exchange(Merge) ├── build keys: [a1z.a0k (#48), a1z.a0n (#50)] ├── probe keys: [a0c.a0k (#7), a0c.a0n (#10)] ├── other filters: [lte(a1z.a2c (#52), a0c.a0d (#0)), gt(a1z.a2k (#61), a0c.a0d (#0))] - ├── Join(Inner) - │ ├── build keys: [a0m (#149)] - │ ├── probe keys: [a0c.a0m (#9)] - │ ├── other filters: [] - │ ├── Join(Inner) - │ │ ├── build keys: [a5r.a5t (#151)] - │ │ ├── probe keys: [a0c.a0l (#8)] - │ │ ├── other filters: [] - │ │ ├── Scan - │ │ │ ├── table: default.a0c (#0) - │ │ │ ├── filters: [gte(a0c.a0d (#0), '20240526'), lte(a0c.a0d (#0), '20250525')] - │ │ │ ├── order by: [] - │ │ │ └── limit: NONE - │ │ └── Exchange(Broadcast) - │ │ └── Scan - │ │ ├── table: default.a5r (#3) - │ │ ├── filters: [eq(substring(a5r.a5w (#156), 1, 1), '1')] - │ │ ├── order by: [] - │ │ └── limit: NONE - │ └── Exchange(Broadcast) - │ └── EvalScalar - │ ├── scalars: [CAST(a2x.a0m (#74) AS String NULL) AS (#149)] - │ └── Scan - │ ├── table: default.a2x (#2) - │ ├── filters: [eq(substring(a2x.a4m (#118), 20, 1), '1')] - │ ├── order by: [] - │ └── limit: NONE - └── Exchange(Broadcast) - └── Scan - ├── table: default.a1z (#1) - ├── filters: [eq(a1z.a2t (#70), '624100')] - ├── order by: [] - └── limit: NONE + ├── Exchange(Broadcast) + │ └── Scan + │ ├── table: default.a1z (#1) + │ ├── filters: [eq(a1z.a2t (#70), '624100')] + │ ├── order by: [] + │ └── limit: NONE + └── Join(Inner) + ├── build keys: [a0m (#149)] + ├── probe keys: [a0c.a0m (#9)] + ├── other filters: [] + ├── Exchange(Broadcast) + │ └── EvalScalar + │ ├── scalars: [CAST(a2x.a0m (#74) AS String NULL) AS (#149)] + │ └── Scan + │ ├── table: default.a2x (#2) + │ ├── filters: [eq(substring(a2x.a4m (#118), 20, 1), '1')] + │ ├── order by: [] + │ └── limit: NONE + └── Join(Inner) + ├── build keys: [a5r.a5t (#151)] + ├── probe keys: [a0c.a0l (#8)] + ├── other filters: [] + ├── Exchange(Broadcast) + │ └── Scan + │ ├── table: default.a5r (#3) + │ ├── filters: [eq(substring(a5r.a5w (#156), 1, 1), '1')] + │ ├── order by: [] + │ └── limit: NONE + └── Scan + ├── table: default.a0c (#0) + ├── filters: [gte(a0c.a0d (#0), '20240526'), lte(a0c.a0d (#0), '20250525')] + ├── order by: [] + └── limit: NONE diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_sum_case_expression_raw.txt b/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_sum_case_expression_raw.txt index 0cf3cb54cb3d5..b3574a9ed43c0 100644 --- a/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_sum_case_expression_raw.txt +++ b/src/query/service/tests/it/sql/planner/optimizer/data/results/obfuscated/01_multi_join_sum_case_expression_raw.txt @@ -13,34 +13,34 @@ EvalScalar ├── build keys: [a5r.a5t (#151)] ├── probe keys: [a0c.a0l (#8)] ├── other filters: [] - ├── Join(Left) - │ ├── build keys: [a0m (#149)] - │ ├── probe keys: [a0c.a0m (#9)] - │ ├── other filters: [] - │ ├── Join(Left) - │ │ ├── build keys: [a1z.a0k (#48), a1z.a0n (#50)] - │ │ ├── probe keys: [a0c.a0k (#7), a0c.a0n (#10)] - │ │ ├── other filters: [lte(a1z.a2c (#52), a0c.a0d (#0)), gt(a1z.a2k (#61), a0c.a0d (#0))] - │ │ ├── Scan - │ │ │ ├── table: default.a0c (#0) - │ │ │ ├── filters: [] - │ │ │ ├── order by: [] - │ │ │ └── limit: NONE - │ │ └── Scan - │ │ ├── table: default.a1z (#1) - │ │ ├── filters: [] - │ │ ├── order by: [] - │ │ └── limit: NONE - │ └── EvalScalar - │ ├── scalars: [CAST(a2x.a0m (#74) AS String NULL) AS (#149)] - │ └── Scan - │ ├── table: default.a2x (#2) - │ ├── filters: [] - │ ├── order by: [] - │ └── limit: NONE - └── Scan - ├── table: default.a5r (#3) - ├── filters: [] - ├── order by: [] - └── limit: NONE + ├── Scan + │ ├── table: default.a5r (#3) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Join(Left) + ├── build keys: [a0m (#149)] + ├── probe keys: [a0c.a0m (#9)] + ├── other filters: [] + ├── EvalScalar + │ ├── scalars: [CAST(a2x.a0m (#74) AS String NULL) AS (#149)] + │ └── Scan + │ ├── table: default.a2x (#2) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Join(Left) + ├── build keys: [a1z.a0k (#48), a1z.a0n (#50)] + ├── probe keys: [a0c.a0k (#7), a0c.a0n (#10)] + ├── other filters: [lte(a1z.a2c (#52), a0c.a0d (#0)), gt(a1z.a2k (#61), a0c.a0d (#0))] + ├── Scan + │ ├── table: default.a1z (#1) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Scan + ├── table: default.a0c (#0) + ├── filters: [] + ├── order by: [] + └── limit: NONE diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q01_optimized.txt b/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q01_optimized.txt index 89654c27e1c7c..64e7bdd637d89 100644 --- a/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q01_optimized.txt +++ b/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q01_optimized.txt @@ -14,85 +14,85 @@ Limit ├── build keys: [customer.c_customer_sk (#78)] ├── probe keys: [store_returns.sr_customer_sk (#3)] ├── other filters: [] - ├── Join(Inner) - │ ├── build keys: [store_returns.sr_store_sk (#103)] - │ ├── probe keys: [store_returns.sr_store_sk (#7)] - │ ├── other filters: [gt(Sum(sr_return_amt) (#48), sum(ctr_total_return) / if(count(ctr_total_return) = 0, 1, count(ctr_total_return)) * 1.2 (#147))] - │ ├── Aggregate(Final) - │ │ ├── group items: [store_returns.sr_customer_sk (#3) AS (#3), store_returns.sr_store_sk (#7) AS (#7)] - │ │ ├── aggregate functions: [sum(store_returns.sr_return_amt (#11)) AS (#48)] - │ │ └── Aggregate(Partial) - │ │ ├── group items: [store_returns.sr_customer_sk (#3) AS (#3), store_returns.sr_store_sk (#7) AS (#7)] - │ │ ├── aggregate functions: [sum(store_returns.sr_return_amt (#11)) AS (#48)] - │ │ └── Exchange(Hash) - │ │ ├── Exchange(Hash): keys: [store_returns.sr_customer_sk (#3)] - │ │ └── EvalScalar - │ │ ├── scalars: [store_returns.sr_customer_sk (#3) AS (#3), store_returns.sr_store_sk (#7) AS (#7), store_returns.sr_return_amt (#11) AS (#11), store_returns.sr_returned_date_sk (#0) AS (#148), date_dim.d_date_sk (#20) AS (#149), date_dim.d_year (#26) AS (#150)] - │ │ └── Join(Inner) - │ │ ├── build keys: [date_dim.d_date_sk (#20)] - │ │ ├── probe keys: [store_returns.sr_returned_date_sk (#0)] - │ │ ├── other filters: [] - │ │ ├── Scan - │ │ │ ├── table: default.store_returns (#0) - │ │ │ ├── filters: [] - │ │ │ ├── order by: [] - │ │ │ └── limit: NONE - │ │ └── Exchange(Broadcast) - │ │ └── Scan - │ │ ├── table: default.date_dim (#1) - │ │ ├── filters: [eq(date_dim.d_year (#26), 2001)] - │ │ ├── order by: [] - │ │ └── limit: NONE - │ └── Exchange(Broadcast) - │ └── Join(Inner) - │ ├── build keys: [store_returns.sr_store_sk (#103)] - │ ├── probe keys: [store.s_store_sk (#49)] - │ ├── other filters: [] - │ ├── Scan - │ │ ├── table: default.store (#2) - │ │ ├── filters: [eq(store.s_state (#73), 'TN')] - │ │ ├── order by: [] - │ │ └── limit: NONE - │ └── Exchange(Broadcast) - │ └── EvalScalar - │ ├── scalars: [store_returns.sr_store_sk (#103) AS (#103), multiply(divide(sum(ctr_total_return) (#145), if(eq(count(ctr_total_return) (#146), 0), 1, count(ctr_total_return) (#146))), 1.2) AS (#147)] - │ └── Aggregate(Final) - │ ├── group items: [store_returns.sr_store_sk (#103) AS (#103)] - │ ├── aggregate functions: [sum(Sum(sr_return_amt) (#144)) AS (#145), count(Sum(sr_return_amt) (#144)) AS (#146)] - │ └── Aggregate(Partial) - │ ├── group items: [store_returns.sr_store_sk (#103) AS (#103)] - │ ├── aggregate functions: [sum(Sum(sr_return_amt) (#144)) AS (#145), count(Sum(sr_return_amt) (#144)) AS (#146)] - │ └── Exchange(Hash) - │ ├── Exchange(Hash): keys: [store_returns.sr_store_sk (#103)] - │ └── Aggregate(Final) - │ ├── group items: [store_returns.sr_customer_sk (#99) AS (#99), store_returns.sr_store_sk (#103) AS (#103)] - │ ├── aggregate functions: [sum(store_returns.sr_return_amt (#107)) AS (#144)] - │ └── Aggregate(Partial) - │ ├── group items: [store_returns.sr_customer_sk (#99) AS (#99), store_returns.sr_store_sk (#103) AS (#103)] - │ ├── aggregate functions: [sum(store_returns.sr_return_amt (#107)) AS (#144)] - │ └── Exchange(Hash) - │ ├── Exchange(Hash): keys: [store_returns.sr_customer_sk (#99)] - │ └── EvalScalar - │ ├── scalars: [store_returns.sr_customer_sk (#99) AS (#99), store_returns.sr_store_sk (#103) AS (#103), store_returns.sr_return_amt (#107) AS (#107), store_returns.sr_returned_date_sk (#96) AS (#151), date_dim.d_date_sk (#116) AS (#152), date_dim.d_year (#122) AS (#153)] - │ └── Join(Inner) - │ ├── build keys: [date_dim.d_date_sk (#116)] - │ ├── probe keys: [store_returns.sr_returned_date_sk (#96)] - │ ├── other filters: [] - │ ├── Scan - │ │ ├── table: default.store_returns (#4) - │ │ ├── filters: [] - │ │ ├── order by: [] - │ │ └── limit: NONE - │ └── Exchange(Broadcast) - │ └── Scan - │ ├── table: default.date_dim (#5) - │ ├── filters: [eq(date_dim.d_year (#122), 2001)] - │ ├── order by: [] - │ └── limit: NONE - └── Exchange(Broadcast) - └── Scan - ├── table: default.customer (#3) - ├── filters: [] - ├── order by: [] - └── limit: NONE + ├── Exchange(Broadcast) + │ └── Scan + │ ├── table: default.customer (#3) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Join(Inner) + ├── build keys: [store_returns.sr_store_sk (#103)] + ├── probe keys: [store_returns.sr_store_sk (#7)] + ├── other filters: [gt(Sum(sr_return_amt) (#48), sum(ctr_total_return) / if(count(ctr_total_return) = 0, 1, count(ctr_total_return)) * 1.2 (#147))] + ├── Exchange(Broadcast) + │ └── Join(Inner) + │ ├── build keys: [store_returns.sr_store_sk (#103)] + │ ├── probe keys: [store.s_store_sk (#49)] + │ ├── other filters: [] + │ ├── Exchange(Broadcast) + │ │ └── EvalScalar + │ │ ├── scalars: [store_returns.sr_store_sk (#103) AS (#103), multiply(divide(sum(ctr_total_return) (#145), if(eq(count(ctr_total_return) (#146), 0), 1, count(ctr_total_return) (#146))), 1.2) AS (#147)] + │ │ └── Aggregate(Final) + │ │ ├── group items: [store_returns.sr_store_sk (#103) AS (#103)] + │ │ ├── aggregate functions: [sum(Sum(sr_return_amt) (#144)) AS (#145), count(Sum(sr_return_amt) (#144)) AS (#146)] + │ │ └── Aggregate(Partial) + │ │ ├── group items: [store_returns.sr_store_sk (#103) AS (#103)] + │ │ ├── aggregate functions: [sum(Sum(sr_return_amt) (#144)) AS (#145), count(Sum(sr_return_amt) (#144)) AS (#146)] + │ │ └── Exchange(Hash) + │ │ ├── Exchange(Hash): keys: [store_returns.sr_store_sk (#103)] + │ │ └── Aggregate(Final) + │ │ ├── group items: [store_returns.sr_customer_sk (#99) AS (#99), store_returns.sr_store_sk (#103) AS (#103)] + │ │ ├── aggregate functions: [sum(store_returns.sr_return_amt (#107)) AS (#144)] + │ │ └── Aggregate(Partial) + │ │ ├── group items: [store_returns.sr_customer_sk (#99) AS (#99), store_returns.sr_store_sk (#103) AS (#103)] + │ │ ├── aggregate functions: [sum(store_returns.sr_return_amt (#107)) AS (#144)] + │ │ └── Exchange(Hash) + │ │ ├── Exchange(Hash): keys: [store_returns.sr_customer_sk (#99)] + │ │ └── EvalScalar + │ │ ├── scalars: [store_returns.sr_customer_sk (#99) AS (#99), store_returns.sr_store_sk (#103) AS (#103), store_returns.sr_return_amt (#107) AS (#107), store_returns.sr_returned_date_sk (#96) AS (#151), date_dim.d_date_sk (#116) AS (#152), date_dim.d_year (#122) AS (#153)] + │ │ └── Join(Inner) + │ │ ├── build keys: [date_dim.d_date_sk (#116)] + │ │ ├── probe keys: [store_returns.sr_returned_date_sk (#96)] + │ │ ├── other filters: [] + │ │ ├── Exchange(Broadcast) + │ │ │ └── Scan + │ │ │ ├── table: default.date_dim (#5) + │ │ │ ├── filters: [eq(date_dim.d_year (#122), 2001)] + │ │ │ ├── order by: [] + │ │ │ └── limit: NONE + │ │ └── Scan + │ │ ├── table: default.store_returns (#4) + │ │ ├── filters: [] + │ │ ├── order by: [] + │ │ └── limit: NONE + │ └── Scan + │ ├── table: default.store (#2) + │ ├── filters: [eq(store.s_state (#73), 'TN')] + │ ├── order by: [] + │ └── limit: NONE + └── Aggregate(Final) + ├── group items: [store_returns.sr_customer_sk (#3) AS (#3), store_returns.sr_store_sk (#7) AS (#7)] + ├── aggregate functions: [sum(store_returns.sr_return_amt (#11)) AS (#48)] + └── Aggregate(Partial) + ├── group items: [store_returns.sr_customer_sk (#3) AS (#3), store_returns.sr_store_sk (#7) AS (#7)] + ├── aggregate functions: [sum(store_returns.sr_return_amt (#11)) AS (#48)] + └── Exchange(Hash) + ├── Exchange(Hash): keys: [store_returns.sr_customer_sk (#3)] + └── EvalScalar + ├── scalars: [store_returns.sr_customer_sk (#3) AS (#3), store_returns.sr_store_sk (#7) AS (#7), store_returns.sr_return_amt (#11) AS (#11), store_returns.sr_returned_date_sk (#0) AS (#148), date_dim.d_date_sk (#20) AS (#149), date_dim.d_year (#26) AS (#150)] + └── Join(Inner) + ├── build keys: [date_dim.d_date_sk (#20)] + ├── probe keys: [store_returns.sr_returned_date_sk (#0)] + ├── other filters: [] + ├── Exchange(Broadcast) + │ └── Scan + │ ├── table: default.date_dim (#1) + │ ├── filters: [eq(date_dim.d_year (#26), 2001)] + │ ├── order by: [] + │ └── limit: NONE + └── Scan + ├── table: default.store_returns (#0) + ├── filters: [] + ├── order by: [] + └── limit: NONE diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q01_raw.txt b/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q01_raw.txt index 0ab2b3a083eaf..62417689e68b2 100644 --- a/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q01_raw.txt +++ b/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q01_raw.txt @@ -34,12 +34,12 @@ Limit │ ├── probe keys: [] │ ├── other filters: [] │ ├── Scan - │ │ ├── table: default.store_returns (#4) + │ │ ├── table: default.date_dim (#5) │ │ ├── filters: [] │ │ ├── order by: [] │ │ └── limit: NONE │ └── Scan - │ ├── table: default.date_dim (#5) + │ ├── table: default.store_returns (#4) │ ├── filters: [] │ ├── order by: [] │ └── limit: NONE @@ -47,41 +47,41 @@ Limit ├── build keys: [] ├── probe keys: [] ├── other filters: [] - ├── Join(Cross) - │ ├── build keys: [] - │ ├── probe keys: [] - │ ├── other filters: [] - │ ├── EvalScalar - │ │ ├── scalars: [store_returns.sr_customer_sk (#3) AS (#3), store_returns.sr_store_sk (#7) AS (#7), Sum(sr_return_amt) (#48) AS (#48)] - │ │ └── Aggregate(Initial) - │ │ ├── group items: [store_returns.sr_customer_sk (#3) AS (#3), store_returns.sr_store_sk (#7) AS (#7)] - │ │ ├── aggregate functions: [sum(store_returns.sr_return_amt (#11)) AS (#48)] - │ │ └── EvalScalar - │ │ ├── scalars: [store_returns.sr_customer_sk (#3) AS (#3), store_returns.sr_store_sk (#7) AS (#7), store_returns.sr_return_amt (#11) AS (#11)] - │ │ └── Filter - │ │ ├── filters: [eq(store_returns.sr_returned_date_sk (#0), date_dim.d_date_sk (#20)), eq(date_dim.d_year (#26), 2001)] - │ │ └── Join(Cross) - │ │ ├── build keys: [] - │ │ ├── probe keys: [] - │ │ ├── other filters: [] - │ │ ├── Scan - │ │ │ ├── table: default.store_returns (#0) - │ │ │ ├── filters: [] - │ │ │ ├── order by: [] - │ │ │ └── limit: NONE - │ │ └── Scan - │ │ ├── table: default.date_dim (#1) - │ │ ├── filters: [] - │ │ ├── order by: [] - │ │ └── limit: NONE - │ └── Scan - │ ├── table: default.store (#2) - │ ├── filters: [] - │ ├── order by: [] - │ └── limit: NONE - └── Scan - ├── table: default.customer (#3) - ├── filters: [] - ├── order by: [] - └── limit: NONE + ├── Scan + │ ├── table: default.customer (#3) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Join(Cross) + ├── build keys: [] + ├── probe keys: [] + ├── other filters: [] + ├── Scan + │ ├── table: default.store (#2) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── EvalScalar + ├── scalars: [store_returns.sr_customer_sk (#3) AS (#3), store_returns.sr_store_sk (#7) AS (#7), Sum(sr_return_amt) (#48) AS (#48)] + └── Aggregate(Initial) + ├── group items: [store_returns.sr_customer_sk (#3) AS (#3), store_returns.sr_store_sk (#7) AS (#7)] + ├── aggregate functions: [sum(store_returns.sr_return_amt (#11)) AS (#48)] + └── EvalScalar + ├── scalars: [store_returns.sr_customer_sk (#3) AS (#3), store_returns.sr_store_sk (#7) AS (#7), store_returns.sr_return_amt (#11) AS (#11)] + └── Filter + ├── filters: [eq(store_returns.sr_returned_date_sk (#0), date_dim.d_date_sk (#20)), eq(date_dim.d_year (#26), 2001)] + └── Join(Cross) + ├── build keys: [] + ├── probe keys: [] + ├── other filters: [] + ├── Scan + │ ├── table: default.date_dim (#1) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Scan + ├── table: default.store_returns (#0) + ├── filters: [] + ├── order by: [] + └── limit: NONE diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q03_optimized.txt b/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q03_optimized.txt index 58a34ce6b9083..4307b9b33c727 100644 --- a/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q03_optimized.txt +++ b/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q03_optimized.txt @@ -22,25 +22,25 @@ Limit ├── build keys: [item.i_item_sk (#51)] ├── probe keys: [store_sales.ss_item_sk (#30)] ├── other filters: [] - ├── Join(Inner) - │ ├── build keys: [date_dim.d_date_sk (#0)] - │ ├── probe keys: [store_sales.ss_sold_date_sk (#28)] - │ ├── other filters: [] - │ ├── Scan - │ │ ├── table: default.store_sales (#1) - │ │ ├── filters: [] - │ │ ├── order by: [] - │ │ └── limit: NONE - │ └── Exchange(Broadcast) - │ └── Scan - │ ├── table: default.date_dim (#0) - │ ├── filters: [eq(date_dim.d_moy (#8), 11)] - │ ├── order by: [] - │ └── limit: NONE - └── Exchange(Broadcast) + ├── Exchange(Broadcast) + │ └── Scan + │ ├── table: default.item (#2) + │ ├── filters: [eq(item.i_manufact_id (#64), 128)] + │ ├── order by: [] + │ └── limit: NONE + └── Join(Inner) + ├── build keys: [date_dim.d_date_sk (#0)] + ├── probe keys: [store_sales.ss_sold_date_sk (#28)] + ├── other filters: [] + ├── Exchange(Broadcast) + │ └── Scan + │ ├── table: default.date_dim (#0) + │ ├── filters: [eq(date_dim.d_moy (#8), 11)] + │ ├── order by: [] + │ └── limit: NONE └── Scan - ├── table: default.item (#2) - ├── filters: [eq(item.i_manufact_id (#64), 128)] + ├── table: default.store_sales (#1) + ├── filters: [] ├── order by: [] └── limit: NONE diff --git a/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q03_raw.txt b/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q03_raw.txt index c7ec0d351f19e..8112b3033ebba 100644 --- a/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q03_raw.txt +++ b/src/query/service/tests/it/sql/planner/optimizer/data/results/tpcds/Q03_raw.txt @@ -17,23 +17,23 @@ Limit ├── build keys: [] ├── probe keys: [] ├── other filters: [] - ├── Join(Cross) - │ ├── build keys: [] - │ ├── probe keys: [] - │ ├── other filters: [] - │ ├── Scan - │ │ ├── table: default.date_dim (#0) - │ │ ├── filters: [] - │ │ ├── order by: [] - │ │ └── limit: NONE - │ └── Scan - │ ├── table: default.store_sales (#1) - │ ├── filters: [] - │ ├── order by: [] - │ └── limit: NONE - └── Scan - ├── table: default.item (#2) - ├── filters: [] - ├── order by: [] - └── limit: NONE + ├── Scan + │ ├── table: default.item (#2) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Join(Cross) + ├── build keys: [] + ├── probe keys: [] + ├── other filters: [] + ├── Scan + │ ├── table: default.store_sales (#1) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Scan + ├── table: default.date_dim (#0) + ├── filters: [] + ├── order by: [] + └── limit: NONE diff --git a/src/query/sql/src/planner/format/display.rs b/src/query/sql/src/planner/format/display.rs index 2fa123d0e4e70..95ae80f99095c 100644 --- a/src/query/sql/src/planner/format/display.rs +++ b/src/query/sql/src/planner/format/display.rs @@ -190,12 +190,19 @@ where )); } - let children = s_expr - .children() - .map(|s_expr| self.humanize_s_expr(s_expr)) - .collect::>>()?; + if s_expr.plan.is_join() { + tree.children + .push(self.humanize_s_expr(s_expr.build_side_child())?); + tree.children + .push(self.humanize_s_expr(s_expr.probe_side_child())?); + } else { + let children = s_expr + .children() + .map(|s_expr| self.humanize_s_expr(s_expr)) + .collect::>>()?; + tree.children.extend(children); + }; - tree.children.extend(children); Ok(tree) } @@ -245,7 +252,10 @@ where let column = self.id_humanizer.humanize_column_id(*column); let hist = format!( "{{ min: {}, max: {}, ndv: {}, null count: {} }}", - hist.min, hist.max, hist.ndv, hist.null_count + hist.min, + hist.max, + hist.ndv.value(), + hist.null_count ); FormatTreeNode::new(format!("{}: {}", column, hist)) }) diff --git a/src/query/sql/src/planner/optimizer/ir/mod.rs b/src/query/sql/src/planner/optimizer/ir/mod.rs index 76482ede84989..cdd97501712d4 100644 --- a/src/query/sql/src/planner/optimizer/ir/mod.rs +++ b/src/query/sql/src/planner/optimizer/ir/mod.rs @@ -44,6 +44,6 @@ pub use stats::ColumnStat; pub use stats::ColumnStatSet; pub use stats::HistogramBuilder; pub use stats::MAX_SELECTIVITY; -pub use stats::NewStatistic; +pub use stats::Ndv; pub use stats::SelectivityEstimator; pub use stats::UniformSampleSet; diff --git a/src/query/sql/src/planner/optimizer/ir/stats/column_stat.rs b/src/query/sql/src/planner/optimizer/ir/stats/column_stat.rs index 8fa854bd56864..79b407cebe535 100644 --- a/src/query/sql/src/planner/optimizer/ir/stats/column_stat.rs +++ b/src/query/sql/src/planner/optimizer/ir/stats/column_stat.rs @@ -17,6 +17,7 @@ use std::collections::HashMap; use databend_common_storage::Datum; use databend_common_storage::Histogram; +use super::selectivity::DEFAULT_SELECTIVITY; use crate::IndexType; pub type ColumnStatSet = HashMap; @@ -31,7 +32,7 @@ pub struct ColumnStat { pub max: Datum, /// Number of distinct values - pub ndv: f64, + pub ndv: Ndv, /// Count of null values pub null_count: u64, @@ -40,9 +41,45 @@ pub struct ColumnStat { pub histogram: Option, } -#[derive(Debug, Clone)] -pub struct NewStatistic { - pub min: Option, - pub max: Option, - pub ndv: Option, +#[derive(Debug, Clone, Copy)] +pub enum Ndv { + // safe for selectivity + Stat(f64), + Max(f64), +} + +impl Ndv { + pub fn reduce(self, ndv: f64) -> Self { + match self { + Ndv::Stat(v) => Ndv::Stat(v.min(ndv)), + Ndv::Max(v) => Ndv::Max(v.min(ndv)), + } + } + + pub fn reduce_by_selectivity(self, selectivity: f64) -> Self { + match self { + Ndv::Stat(v) => Ndv::Stat((v * selectivity).ceil()), + Ndv::Max(v) => Ndv::Max((v * selectivity).ceil()), + } + } + + pub fn value(self) -> f64 { + match self { + Ndv::Stat(v) => v, + Ndv::Max(v) => v, + } + } + + pub fn equal_selectivity(&self, not: bool) -> f64 { + let ndv = self.value(); + if ndv == 0.0 { + 0.0 + } else { + let selectivity = if not { 1.0 - 1.0 / ndv } else { 1.0 / ndv }; + match self { + Ndv::Stat(_) => selectivity, + Ndv::Max(_) => selectivity.max(DEFAULT_SELECTIVITY), + } + } + } } diff --git a/src/query/sql/src/planner/optimizer/ir/stats/mod.rs b/src/query/sql/src/planner/optimizer/ir/stats/mod.rs index c0a96b9f0c308..29ab0e8b05503 100644 --- a/src/query/sql/src/planner/optimizer/ir/stats/mod.rs +++ b/src/query/sql/src/planner/optimizer/ir/stats/mod.rs @@ -16,9 +16,7 @@ mod column_stat; mod histogram; mod selectivity; -pub use column_stat::ColumnStat; -pub use column_stat::ColumnStatSet; -pub use column_stat::NewStatistic; +pub use column_stat::*; pub use histogram::HistogramBuilder; pub use histogram::UniformSampleSet; pub use selectivity::MAX_SELECTIVITY; diff --git a/src/query/sql/src/planner/optimizer/ir/stats/selectivity.rs b/src/query/sql/src/planner/optimizer/ir/stats/selectivity.rs index bd66ccd53fd74..195dc932ddf0f 100644 --- a/src/query/sql/src/planner/optimizer/ir/stats/selectivity.rs +++ b/src/query/sql/src/planner/optimizer/ir/stats/selectivity.rs @@ -24,8 +24,6 @@ use databend_common_expression::Expr; use databend_common_expression::FunctionContext; use databend_common_expression::Scalar; use databend_common_expression::type_check; -use databend_common_expression::types::DataType; -use databend_common_expression::types::NumberDataType; use databend_common_expression::types::NumberScalar; use databend_common_functions::BUILTIN_FUNCTIONS; use databend_common_storage::DEFAULT_HISTOGRAM_BUCKETS; @@ -249,7 +247,7 @@ impl<'a> SelectivityEstimator<'a> { // For equal predicate, we just use cardinality of a single // value to estimate the selectivity. This assumes that // the column is in a uniform distribution. - let selectivity = evaluate_equal(column_stat, constant); + let selectivity = evaluate_equal(column_stat, false, constant); if update { update_statistic( column_stat, @@ -263,7 +261,7 @@ impl<'a> SelectivityEstimator<'a> { } ComparisonOp::NotEqual => { // For not equal predicate, we treat it as opposite of equal predicate. - let selectivity = 1.0 - evaluate_equal(column_stat, constant); + let selectivity = evaluate_equal(column_stat, true, constant); if update { update_statistic( column_stat, @@ -342,25 +340,26 @@ impl<'a> SelectivityEstimator<'a> { // Update other columns' statistic according to selectivity. pub fn update_other_statistic_by_selectivity(&mut self, selectivity: f64) { for (index, column_stat) in self.input_stat.column_stats.iter_mut() { - if !self.updated_column_indexes.contains(index) { - let new_ndv = (column_stat.ndv * selectivity).ceil(); - column_stat.ndv = new_ndv; - if let Some(histogram) = &mut column_stat.histogram { - if histogram.accuracy { - // If selectivity < 0.2, most buckets are invalid and - // the accuracy histogram can be discarded. - // Todo: find a better way to update histogram. - if selectivity < 0.2 { - column_stat.histogram = None; - } - continue; - } - if new_ndv as u64 <= 2 { + if self.updated_column_indexes.contains(index) { + continue; + } + column_stat.ndv = column_stat.ndv.reduce_by_selectivity(selectivity); + + if let Some(histogram) = &mut column_stat.histogram { + if histogram.accuracy { + // If selectivity < 0.2, most buckets are invalid and + // the accuracy histogram can be discarded. + // Todo: find a better way to update histogram. + if selectivity < 0.2 { column_stat.histogram = None; - } else { - for bucket in histogram.buckets.iter_mut() { - bucket.update(selectivity); - } + } + continue; + } + if column_stat.ndv.value() as u64 <= 2 { + column_stat.histogram = None; + } else { + for bucket in histogram.buckets.iter_mut() { + bucket.update(selectivity); } } } @@ -375,161 +374,123 @@ impl<'a> SelectivityEstimator<'a> { column_stat: &mut ColumnStat, updated_column_indexes: &mut HashSet, ) -> Result { - let histogram = column_stat.histogram.as_ref(); - - if histogram.is_none() && const_datum.is_numeric() { - // If there is no histogram and the column isn't numeric, return default selectivity. - if !column_stat.min.is_numeric() || !column_stat.max.is_numeric() { - return Ok(DEFAULT_SELECTIVITY); - } - - let min = column_stat.min.to_double()?; - let max = column_stat.max.to_double()?; - let ndv = column_stat.ndv; - let numeric_literal = const_datum.to_double()?; - - let (no_overlap, complete_overlap) = match comparison_op { - ComparisonOp::LT => (numeric_literal <= min, numeric_literal > max), - ComparisonOp::LTE => (numeric_literal < min, numeric_literal >= max), - ComparisonOp::GT => (numeric_literal >= max, numeric_literal < min), - ComparisonOp::GTE => (numeric_literal > max, numeric_literal <= min), - _ => unreachable!(), - }; + let selectivity = match column_stat.histogram.as_ref() { + None if const_datum.is_numeric() => { + // If there is no histogram and the column isn't numeric, return default selectivity. + if !column_stat.min.is_numeric() || !column_stat.max.is_numeric() { + return Ok(DEFAULT_SELECTIVITY); + } - let percent = if no_overlap { - 0.0 - } else if complete_overlap { - 1.0 - } else { - match comparison_op { - ComparisonOp::LT => { - if numeric_literal == max { - 1.0 - 1.0 / ndv - } else { - (numeric_literal - min) / (max - min + 1.0) - } - } - ComparisonOp::LTE => { - if numeric_literal == min { - 1.0 / ndv - } else { - (numeric_literal - min) / (max - min + 1.0) - } - } - ComparisonOp::GT => { - if numeric_literal == min { - 1.0 - 1.0 / ndv - } else { - (max - numeric_literal) / (max - min + 1.0) - } + let min = column_stat.min.to_double()?; + let max = column_stat.max.to_double()?; + let ndv = column_stat.ndv; + let numeric_literal = const_datum.to_double()?; + + let cmp_min = numeric_literal.total_cmp(&min); + let cmp_max = numeric_literal.total_cmp(&max); + + use Ordering::*; + match (comparison_op, cmp_min, cmp_max) { + (ComparisonOp::LT, Less | Equal, _) => 0.0, + (ComparisonOp::LTE, Less, _) => 0.0, + (ComparisonOp::LTE, Equal, _) => ndv.equal_selectivity(false), + (ComparisonOp::LT | ComparisonOp::LTE, Greater, Greater) => 1.0, + (ComparisonOp::LT, Greater, Equal) => ndv.equal_selectivity(true), + (ComparisonOp::LT | ComparisonOp::LTE, _, _) => { + (numeric_literal - min) / (max - min + 1.0) } - ComparisonOp::GTE => { - if numeric_literal == max { - 1.0 / ndv - } else { - (max - numeric_literal) / (max - min + 1.0) - } + + (ComparisonOp::GT, _, Greater | Equal) => 0.0, + (ComparisonOp::GTE, _, Greater) => 0.0, + (ComparisonOp::GTE, Less | Equal, _) => 1.0, + (ComparisonOp::GT, Less, _) => 1.0, + (ComparisonOp::GT, Equal, _) => ndv.equal_selectivity(true), + (ComparisonOp::GTE, _, Equal) => ndv.equal_selectivity(false), + (ComparisonOp::GT | ComparisonOp::GTE, _, _) => { + (max - numeric_literal) / (max - min + 1.0) } + _ => unreachable!(), } - }; - - if update { - let new_min = if matches!(comparison_op, ComparisonOp::GT | ComparisonOp::GTE) { - const_datum.clone() - } else { - column_stat.min.clone() - }; - - let new_max = if matches!(comparison_op, ComparisonOp::LT | ComparisonOp::LTE) { - const_datum.clone() - } else { - column_stat.max.clone() - }; - - update_statistic(column_stat, new_min, new_max, percent)?; - updated_column_indexes.insert(column_ref.column.index); } - - return Ok(percent); - } - - let Some(histogram) = histogram else { - return Ok(DEFAULT_SELECTIVITY); - }; - - let mut num_selected = 0.0; - for bucket in histogram.buckets_iter() { - let lower_bound = bucket.lower_bound(); - let upper_bound = bucket.upper_bound(); - - if !const_datum.can_compare(lower_bound) { + None => { return Ok(DEFAULT_SELECTIVITY); } - - let const_gte_upper_bound = matches!( - const_datum.compare(upper_bound)?, - Ordering::Greater | Ordering::Equal - ); - let (no_overlap, complete_overlap) = match comparison_op { - ComparisonOp::LT => ( - matches!( - const_datum.compare(lower_bound)?, - Ordering::Less | Ordering::Equal - ), - const_gte_upper_bound, - ), - ComparisonOp::LTE => ( - matches!(const_datum.compare(lower_bound)?, Ordering::Less), - const_gte_upper_bound, - ), - ComparisonOp::GT => ( - const_gte_upper_bound, - matches!(const_datum.compare(lower_bound)?, Ordering::Less), - ), - ComparisonOp::GTE => ( - const_gte_upper_bound, - matches!( - const_datum.compare(lower_bound)?, - Ordering::Less | Ordering::Equal - ), - ), - _ => unreachable!(), - }; - - if complete_overlap { - num_selected += bucket.num_values(); - } else if !no_overlap && const_datum.is_numeric() { - let ndv = bucket.num_distinct(); - let lower_bound = lower_bound.to_double()?; - let upper_bound = upper_bound.to_double()?; - let const_value = const_datum.to_double()?; - - let bucket_range = upper_bound - lower_bound; - let bucket_selectivity = match comparison_op { - ComparisonOp::LT => (const_value - lower_bound) / bucket_range, - ComparisonOp::LTE => { - if const_value == lower_bound { - 1.0 / ndv - } else { - (const_value - lower_bound + 1.0) / bucket_range - } + Some(histogram) => { + let mut num_selected = 0.0; + for bucket in histogram.buckets_iter() { + let lower_bound = bucket.lower_bound(); + let upper_bound = bucket.upper_bound(); + + if !const_datum.can_compare(lower_bound) { + return Ok(DEFAULT_SELECTIVITY); } - ComparisonOp::GT => { - if const_value == lower_bound { - 1.0 - 1.0 / ndv - } else { - (upper_bound - const_value - 1.0).max(0.0) / bucket_range - } + + let const_gte_upper_bound = matches!( + const_datum.compare(upper_bound)?, + Ordering::Greater | Ordering::Equal + ); + let (no_overlap, complete_overlap) = match comparison_op { + ComparisonOp::LT => ( + matches!( + const_datum.compare(lower_bound)?, + Ordering::Less | Ordering::Equal + ), + const_gte_upper_bound, + ), + ComparisonOp::LTE => ( + matches!(const_datum.compare(lower_bound)?, Ordering::Less), + const_gte_upper_bound, + ), + ComparisonOp::GT => ( + const_gte_upper_bound, + matches!(const_datum.compare(lower_bound)?, Ordering::Less), + ), + ComparisonOp::GTE => ( + const_gte_upper_bound, + matches!( + const_datum.compare(lower_bound)?, + Ordering::Less | Ordering::Equal + ), + ), + _ => unreachable!(), + }; + + if complete_overlap { + num_selected += bucket.num_values(); + } else if !no_overlap && const_datum.is_numeric() { + let ndv = bucket.num_distinct(); + let lower_bound = lower_bound.to_double()?; + let upper_bound = upper_bound.to_double()?; + let const_value = const_datum.to_double()?; + + let bucket_range = upper_bound - lower_bound; + let bucket_selectivity = match comparison_op { + ComparisonOp::LT => (const_value - lower_bound) / bucket_range, + ComparisonOp::LTE => { + if const_value == lower_bound { + 1.0 / ndv + } else { + (const_value - lower_bound + 1.0) / bucket_range + } + } + ComparisonOp::GT => { + if const_value == lower_bound { + 1.0 - 1.0 / ndv + } else { + (upper_bound - const_value - 1.0).max(0.0) / bucket_range + } + } + ComparisonOp::GTE => (upper_bound - const_value) / bucket_range, + _ => unreachable!(), + }; + num_selected += bucket.num_values() * bucket_selectivity; } - ComparisonOp::GTE => (upper_bound - const_value) / bucket_range, - _ => unreachable!(), - }; - num_selected += bucket.num_values() * bucket_selectivity; - } - } + } - let selectivity = num_selected / histogram.num_values(); + num_selected / histogram.num_values() + } + }; if update { let (new_min, new_max) = match comparison_op { @@ -565,54 +526,20 @@ fn is_true_constant_predicate(constant: &ConstantExpr) -> bool { } } -fn evaluate_equal(column_stat: &ColumnStat, constant: &ConstantExpr) -> f64 { - let constant_datum = Datum::from_scalar(constant.value.clone()); - match constant.value.as_ref().infer_data_type() { - DataType::Null => 0.0, - DataType::Number(number) => match number { - NumberDataType::UInt8 - | NumberDataType::UInt16 - | NumberDataType::UInt32 - | NumberDataType::UInt64 - | NumberDataType::Int8 - | NumberDataType::Int16 - | NumberDataType::Int32 - | NumberDataType::Int64 - | NumberDataType::Float32 - | NumberDataType::Float64 => compare_equal(&constant_datum, column_stat), - }, - DataType::Boolean | DataType::Binary | DataType::String => { - compare_equal(&constant_datum, column_stat) - } +fn evaluate_equal(column_stat: &ColumnStat, not_eq: bool, constant: &ConstantExpr) -> f64 { + match &constant.value { + Scalar::Null => return if not_eq { 1.0 } else { 0.0 }, _ => { - if column_stat.ndv == 0.0 { - 0.0 - } else { - 1.0 / column_stat.ndv - } - } - } -} - -fn compare_equal(datum: &Option, column_stat: &ColumnStat) -> f64 { - let col_min = &column_stat.min; - let col_max = &column_stat.max; - if let Some(constant_datum) = datum { - if col_min.type_comparable(constant_datum) { - // Safe to unwrap, because type is comparable. - if constant_datum.compare(col_min).unwrap() == Ordering::Less - || constant_datum.compare(col_max).unwrap() == Ordering::Greater + if let Some(constant) = Datum::from_scalar(constant.value.clone()) + && (matches!(constant.compare(&column_stat.min), Ok(Ordering::Less)) + || matches!(constant.compare(&column_stat.max), Ok(Ordering::Greater))) { - return 0.0; + return if not_eq { 1.0 } else { 0.0 }; } } } - if column_stat.ndv == 0.0 { - 0.0 - } else { - 1.0 / column_stat.ndv - } + column_stat.ndv.equal_selectivity(not_eq) } fn update_statistic( @@ -621,8 +548,8 @@ fn update_statistic( mut new_max: Datum, selectivity: f64, ) -> Result<()> { - let new_ndv = (column_stat.ndv * selectivity).ceil(); - column_stat.ndv = new_ndv; + column_stat.ndv = column_stat.ndv.reduce_by_selectivity(selectivity); + if matches!( new_min, Datum::Bool(_) | Datum::Int(_) | Datum::UInt(_) | Datum::Float(_) @@ -642,7 +569,7 @@ fn update_statistic( } else { let num_values = histogram.num_values(); let new_num_values = (num_values * selectivity).ceil() as u64; - let new_ndv = new_ndv as u64; + let new_ndv = column_stat.ndv.value() as u64; if new_ndv <= 2 { column_stat.histogram = None; return Ok(()); diff --git a/src/query/sql/src/planner/plans/aggregate.rs b/src/query/sql/src/planner/plans/aggregate.rs index 1c3dd73d9c035..0edb1b45d4650 100644 --- a/src/query/sql/src/planner/plans/aggregate.rs +++ b/src/query/sql/src/planner/plans/aggregate.rs @@ -23,6 +23,7 @@ use crate::ColumnSet; use crate::IndexType; use crate::ScalarExpr; use crate::optimizer::ir::Distribution; +use crate::optimizer::ir::Ndv; use crate::optimizer::ir::PhysicalProperty; use crate::optimizer::ir::RelExpr; use crate::optimizer::ir::RelationalProperty; @@ -149,65 +150,81 @@ impl Aggregate { } pub fn derive_agg_stats(&self, stat_info: Arc) -> Result> { - let (cardinality, mut statistics) = (stat_info.cardinality, stat_info.statistics.clone()); - let cardinality = if self.group_items.is_empty() { - // Scalar aggregation - 1.0 - } else if self - .group_items - .iter() - .any(|item| !statistics.column_stats.contains_key(&item.index)) - { - (cardinality * DEFAULT_AGGREGATE_RATIO).max(1_f64) - } else { - // A upper bound - let mut res = self - .group_items - .first() - .map(|item| { - let item_stat = statistics.column_stats.get(&item.index).unwrap(); - item_stat.ndv + let column_stats = &stat_info.statistics.column_stats; + if self.group_items.is_empty() { + return Ok(Arc::new(StatInfo { + cardinality: 1.0, + statistics: Statistics { + precise_cardinality: Some(1), + column_stats: column_stats.clone(), + }, + })); + } + + if self.group_items.iter().any(|item| { + column_stats + .get(&item.index) + .map(|stat| { + if let Ndv::Max(ndv) = stat.ndv { + ndv >= stat_info.cardinality + } else { + false + } }) - .unwrap_or(1.0); + .unwrap_or(true) + }) { + return Ok(Arc::new(StatInfo { + cardinality: (stat_info.cardinality * DEFAULT_AGGREGATE_RATIO).max(1.0), + statistics: Statistics { + precise_cardinality: None, + column_stats: column_stats.clone(), + }, + })); + } - for (idx, item) in self.group_items.iter().skip(1).enumerate() { - let item_stat = statistics.column_stats.get(&item.index).unwrap(); - let ndv = item_stat.ndv * AGGREGATE_COLUMN_CORRELATION_COEFFICIENT.powi(idx as i32); - res *= ndv.max(1_f64); + let groups_ndv = self + .group_items + .iter() + .map(|group| column_stats[&group.index].ndv.value()) + .collect::>(); + + let cardinality = groups_ndv + .into_iter() + .enumerate() + .fold(1.0, |acc, (i, ndv)| { + let x = if i > 1 { + ndv * AGGREGATE_COLUMN_CORRELATION_COEFFICIENT.powi((i - 1) as i32) + } else { + ndv + }; + acc * x.max(1.0) + }) + .min(stat_info.cardinality); + + let mut column_stats = stat_info.statistics.column_stats.clone(); + for item in self.group_items.iter() { + let item_stat = column_stats.get_mut(&item.index).unwrap(); + if self.group_items.len() == 1 { + item_stat.ndv = item_stat.ndv.reduce(cardinality); } - for item in self.group_items.iter() { - let item_stat = statistics.column_stats.get_mut(&item.index).unwrap(); - if let Some(histogram) = &mut item_stat.histogram { - let mut num_values = 0.0; - let mut num_distinct = 0.0; - for bucket in histogram.buckets.iter() { - num_distinct += bucket.num_distinct(); - num_values += bucket.num_values(); - } - // When there is a high probability that eager aggregation - // is better, we will update the histogram. - if num_values / num_distinct >= 10.0 { - for bucket in histogram.buckets.iter_mut() { - bucket.aggregate_values(); - } - } + let Some(histogram) = &mut item_stat.histogram else { + continue; + }; + // When there is a high probability that eager aggregation + // is better, we will update the histogram. + if histogram.num_values() >= histogram.num_distinct_values() * 10.0 { + for bucket in histogram.buckets.iter_mut() { + bucket.aggregate_values(); } } - // To avoid res is very large - f64::min(res, cardinality) - }; + } - let precise_cardinality = if self.group_items.is_empty() { - Some(1) - } else { - None - }; Ok(Arc::new(StatInfo { cardinality, statistics: Statistics { - precise_cardinality, - column_stats: statistics.column_stats, + precise_cardinality: None, + column_stats, }, })) } diff --git a/src/query/sql/src/planner/plans/constant_table_scan.rs b/src/query/sql/src/planner/plans/constant_table_scan.rs index e3b29ca1c530d..f6d34ba485864 100644 --- a/src/query/sql/src/planner/plans/constant_table_scan.rs +++ b/src/query/sql/src/planner/plans/constant_table_scan.rs @@ -33,6 +33,7 @@ use crate::optimizer::ir::ColumnStat; use crate::optimizer::ir::ColumnStatSet; use crate::optimizer::ir::Distribution; use crate::optimizer::ir::HistogramBuilder; +use crate::optimizer::ir::Ndv; use crate::optimizer::ir::PhysicalProperty; use crate::optimizer::ir::RelExpr; use crate::optimizer::ir::RelationalProperty; @@ -223,7 +224,7 @@ impl Operator for ConstantTableScan { let column_stat = ColumnStat { min, max, - ndv: ndv as f64, + ndv: Ndv::Stat(ndv as _), null_count, histogram, }; diff --git a/src/query/sql/src/planner/plans/join.rs b/src/query/sql/src/planner/plans/join.rs index 1d26198edb9b6..9fc13f49bc75a 100644 --- a/src/query/sql/src/planner/plans/join.rs +++ b/src/query/sql/src/planner/plans/join.rs @@ -30,7 +30,7 @@ use crate::IndexType; use crate::optimizer::ir::ColumnStat; use crate::optimizer::ir::Distribution; use crate::optimizer::ir::HistogramBuilder; -use crate::optimizer::ir::NewStatistic; +use crate::optimizer::ir::Ndv; use crate::optimizer::ir::PhysicalProperty; use crate::optimizer::ir::RelExpr; use crate::optimizer::ir::RelationalProperty; @@ -323,10 +323,10 @@ impl Join { let mut join_card_updated = false; let mut left_column_index = 0; let mut right_column_index = 0; - for condition in self.equi_conditions.iter() { + for condition in &self.equi_conditions { let left_condition = &condition.left; let right_condition = &condition.right; - if join_card == 0 as f64 { + if join_card == 0.0 { break; } // Currently don't consider the case such as: `t1.a + t1.b = t2.a` @@ -334,143 +334,114 @@ impl Join { { continue; } - let left_col_stat = left_statistics + let Some(left_col_stat) = left_statistics .column_stats - .get(left_condition.used_columns().iter().next().unwrap()); - let right_col_stat = right_statistics + .get(left_condition.used_columns().iter().next().unwrap()) + else { + continue; + }; + let Some(right_col_stat) = right_statistics .column_stats - .get(right_condition.used_columns().iter().next().unwrap()); - match (left_col_stat, right_col_stat) { - (Some(left_col_stat), Some(right_col_stat)) => { - if !left_col_stat.min.type_comparable(&right_col_stat.min) { - continue; - } - let left_interval = - UniformSampleSet::new(left_col_stat.min.clone(), left_col_stat.max.clone()); - let right_interval = UniformSampleSet::new( - right_col_stat.min.clone(), - right_col_stat.max.clone(), - ); - if !left_interval.has_intersection(&right_interval)? { - join_card = 0.0; - continue; - } + .get(right_condition.used_columns().iter().next().unwrap()) + else { + continue; + }; - // Update column min and max value - let mut new_ndv = None; - let (new_min, new_max) = left_interval.intersection(&right_interval)?; - - if let Datum::Bytes(_) | Datum::Bool(_) = left_col_stat.min { - let card = evaluate_by_ndv( - left_col_stat, - right_col_stat, - *left_cardinality, - *right_cardinality, - &mut new_ndv, - ); - let (left_index, right_index) = update_statistic( - left_statistics, - right_statistics, - left_condition, - right_condition, - NewStatistic { - min: new_min, - max: new_max, - ndv: new_ndv, - }, - ); - if card < join_card { - join_card = card; - join_card_updated = true; - left_column_index = left_index; - right_column_index = right_index; - } - continue; - } - let card = match (&left_col_stat.histogram, &right_col_stat.histogram) { - (Some(left_hist), Some(right_hist)) => { - // Evaluate join cardinality by histogram. - evaluate_by_histogram(left_hist, right_hist, &mut new_ndv)? - } - _ => evaluate_by_ndv( - left_col_stat, - right_col_stat, - *left_cardinality, - *right_cardinality, - &mut new_ndv, - ), - }; - let (left_index, right_index) = update_statistic( - left_statistics, - right_statistics, - left_condition, - right_condition, - NewStatistic { - min: new_min, - max: new_max, - ndv: new_ndv, - }, - ); - if card < join_card { - join_card = card; - join_card_updated = true; - left_column_index = left_index; - right_column_index = right_index; - } + if !left_col_stat.min.type_comparable(&right_col_stat.min) { + continue; + } + let left_interval = + UniformSampleSet::new(left_col_stat.min.clone(), left_col_stat.max.clone()); + let right_interval = + UniformSampleSet::new(right_col_stat.min.clone(), right_col_stat.max.clone()); + if !left_interval.has_intersection(&right_interval)? { + join_card = 0.0; + continue; + } + + // Update column min and max value + let mut new_ndv = None; + let (new_min, new_max) = left_interval.intersection(&right_interval)?; + let card = match (&left_col_stat.histogram, &right_col_stat.histogram) { + (Some(left_hist), Some(right_hist)) + if matches!( + left_col_stat.min, + Datum::Int(_) | Datum::UInt(_) | Datum::Float(_) + ) => + { + // Evaluate join cardinality by histogram. + evaluate_by_histogram(left_hist, right_hist, &mut new_ndv)? } - _ => continue, + _ => evaluate_by_ndv( + left_col_stat, + right_col_stat, + *left_cardinality, + *right_cardinality, + &mut new_ndv, + ), + }; + + let (left_index, right_index) = update_statistic( + left_statistics, + right_statistics, + left_condition, + right_condition, + NewStatistic { + min: new_min, + max: new_max, + ndv: new_ndv, + }, + ); + if card < join_card { + join_card = card; + join_card_updated = true; + left_column_index = left_index; + right_column_index = right_index; } } - if join_card_updated { - for (idx, left) in left_statistics.column_stats.iter_mut() { - if *idx == left_column_index { - if left.histogram.is_some() { - // Todo: find a better way to update accuracy histogram - left.histogram = if left.ndv as u64 <= 2 { - None - } else { - if matches!(left.min, Datum::Int(_) | Datum::UInt(_) | Datum::Float(_)) - { - left.min = Datum::Float(F64::from(left.min.to_double()?)); - left.max = Datum::Float(F64::from(left.max.to_double()?)); - } - Some(HistogramBuilder::from_ndv( - left.ndv as u64, - max(join_card as u64, left.ndv as u64), - Some((left.min.clone(), left.max.clone())), - DEFAULT_HISTOGRAM_BUCKETS, - )?) - }; - } - continue; - } + if !join_card_updated { + return Ok(join_card); + } + + for (idx, left) in left_statistics.column_stats.iter_mut() { + if *idx != left_column_index || left.histogram.is_none() || left.ndv.value() as u64 <= 2 + { // Other columns' histograms are inaccurate, so make them None left.histogram = None; + continue; } - for (idx, right) in right_statistics.column_stats.iter_mut() { - if *idx == right_column_index { - if right.histogram.is_some() { - // Todo: find a better way to update accuracy histogram - right.histogram = if right.ndv as u64 <= 2 { - None - } else { - if matches!(right.min, Datum::Int(_) | Datum::UInt(_) | Datum::Float(_)) - { - right.min = Datum::Float(F64::from(right.min.to_double()?)); - right.max = Datum::Float(F64::from(right.max.to_double()?)); - } - Some(HistogramBuilder::from_ndv( - right.ndv as u64, - max(join_card as u64, right.ndv as u64), - Some((right.min.clone(), right.max.clone())), - DEFAULT_HISTOGRAM_BUCKETS, - )?) - }; - } - continue; - } + // Todo: find a better way to update accuracy histogram + if matches!(left.min, Datum::Int(_) | Datum::UInt(_) | Datum::Float(_)) { + left.min = Datum::Float(F64::from(left.min.to_double()?)); + left.max = Datum::Float(F64::from(left.max.to_double()?)); + } + left.histogram = Some(HistogramBuilder::from_ndv( + left.ndv.value() as u64, + max(join_card as u64, left.ndv.value() as u64), + Some((left.min.clone(), left.max.clone())), + DEFAULT_HISTOGRAM_BUCKETS, + )?); + } + + for (idx, right) in right_statistics.column_stats.iter_mut() { + if *idx != right_column_index + || right.histogram.is_none() + || right.ndv.value() as u64 <= 2 + { right.histogram = None; + continue; } + // Todo: find a better way to update accuracy histogram + if matches!(right.min, Datum::Int(_) | Datum::UInt(_) | Datum::Float(_)) { + right.min = Datum::Float(F64::from(right.min.to_double()?)); + right.max = Datum::Float(F64::from(right.max.to_double()?)); + } + right.histogram = Some(HistogramBuilder::from_ndv( + right.ndv.value() as u64, + max(join_card as u64, right.ndv.value() as u64), + Some((right.min.clone(), right.max.clone())), + DEFAULT_HISTOGRAM_BUCKETS, + )?); } Ok(join_card) } @@ -526,10 +497,15 @@ impl Join { let column_stats = if cardinality == 0.0 { HashMap::new() } else { - let mut column_stats = HashMap::new(); - column_stats.extend(left_statistics.column_stats); - column_stats.extend(right_statistics.column_stats); - column_stats + left_statistics + .column_stats + .into_iter() + .chain(right_statistics.column_stats) + .map(|(col, mut stat)| { + stat.ndv = stat.ndv.reduce(cardinality); + (col, stat) + }) + .collect() }; Ok(Arc::new(StatInfo { cardinality, @@ -987,10 +963,21 @@ fn evaluate_by_ndv( right_cardinality: f64, new_ndv: &mut Option, ) -> f64 { - // Update column ndv - *new_ndv = Some(left_stat.ndv.min(right_stat.ndv)); - - let max_ndv = f64::max(left_stat.ndv, right_stat.ndv); + let max_ndv = match (left_stat.ndv, right_stat.ndv) { + (Ndv::Stat(a), Ndv::Stat(b)) => { + *new_ndv = Some(f64::min(a, b)); + f64::max(a, b) + } + (Ndv::Stat(stat), Ndv::Max(max)) | (Ndv::Max(max), Ndv::Stat(stat)) => { + *new_ndv = Some(f64::min(stat, max)); + stat + } + (Ndv::Max(0.0), Ndv::Max(0.0)) => { + *new_ndv = Some(0.0); + 0.0 + } + _ => left_cardinality * right_cardinality, + }; if max_ndv == 0.0 { 0.0 } else { @@ -1009,6 +996,7 @@ fn update_statistic( let right_index = *right_condition.used_columns().iter().next().unwrap(); let left_col_stat = left_statistics.column_stats.get_mut(&left_index).unwrap(); let right_col_stat = right_statistics.column_stats.get_mut(&right_index).unwrap(); + if let Some(new_min) = new_stat.min { left_col_stat.min = new_min.clone(); right_col_stat.min = new_min; @@ -1018,8 +1006,15 @@ fn update_statistic( right_col_stat.max = new_max; } if let Some(new_ndv) = new_stat.ndv { - left_col_stat.ndv = new_ndv; - right_col_stat.ndv = new_ndv; + left_col_stat.ndv = Ndv::Stat(new_ndv); + right_col_stat.ndv = Ndv::Stat(new_ndv); } (left_index, right_index) } + +#[derive(Debug, Clone)] +struct NewStatistic { + min: Option, + max: Option, + ndv: Option, +} diff --git a/src/query/sql/src/planner/plans/scan.rs b/src/query/sql/src/planner/plans/scan.rs index 123f174e56d55..c46666c2f153a 100644 --- a/src/query/sql/src/planner/plans/scan.rs +++ b/src/query/sql/src/planner/plans/scan.rs @@ -38,6 +38,7 @@ use crate::optimizer::ir::ColumnStatSet; use crate::optimizer::ir::Distribution; use crate::optimizer::ir::HistogramBuilder; use crate::optimizer::ir::MAX_SELECTIVITY; +use crate::optimizer::ir::Ndv; use crate::optimizer::ir::PhysicalProperty; use crate::optimizer::ir::RelExpr; use crate::optimizer::ir::RelationalProperty; @@ -255,8 +256,7 @@ impl Operator for Scan { .statistics .table_stats .as_ref() - .map(|s| s.num_rows.unwrap_or(0)) - .unwrap_or(0); + .and_then(|s| s.num_rows); let mut column_stats: ColumnStatSet = Default::default(); for (k, v) in &self.statistics.column_stats { @@ -265,31 +265,31 @@ impl Operator for Scan { continue; } if let Some(col_stat) = v.clone() { - // Safe to unwrap: min, max are all `Some(_)`. let Some(min) = col_stat.min.clone() else { continue; }; let Some(max) = col_stat.max.clone() else { continue; }; - // ndv could be `None`, we will use `num_rows - null_count` as ndv instead. - // + // NOTE: don't touch the original num_rows, since it will be used in other places. - let mut ndv = col_stat - .ndv - .unwrap_or_else(|| num_rows.saturating_sub(col_stat.null_count)); + let ndv = match col_stat.ndv { + Some(ndv) => Ndv::Stat(ndv as f64), + None => Ndv::Max( + num_rows + .and_then(|n| n.checked_sub(col_stat.null_count)) + .unwrap_or(u64::MAX) as _, + ), + }; // Alter ndv based on min and max if the datum is uint or int. - match (&max, &min) { - (Datum::UInt(m), Datum::UInt(n)) if m >= n => ndv = ndv.min(m - n + 1), + let ndv = match (&max, &min) { + (Datum::UInt(m), Datum::UInt(n)) if m >= n => ndv.reduce((m - n + 1) as _), (Datum::Int(m), Datum::Int(n)) if m >= n => { - ndv = ndv.min(m.saturating_add(1).saturating_sub(*n) as u64) - } - _ => { - if max == min { - ndv = 1 - } + ndv.reduce(m.saturating_add(1).saturating_sub(*n) as _) } + _ if max == min => Ndv::Stat(1.0), + _ => ndv, }; let histogram = if let Some(histogram) = self.statistics.histograms.get(k) @@ -297,28 +297,28 @@ impl Operator for Scan { { histogram.clone() } else { - let num_rows = num_rows.saturating_sub(col_stat.null_count); - let ndv = std::cmp::min(num_rows, ndv); - if num_rows != 0 { + num_rows.and_then(|num_rows| { + let num_rows = num_rows.saturating_sub(col_stat.null_count); + if num_rows == 0 { + return None; + } + let Ndv::Stat(ndv) = ndv else { return None }; HistogramBuilder::from_ndv( - ndv, + ndv as _, num_rows, Some((min.clone(), max.clone())), DEFAULT_HISTOGRAM_BUCKETS, ) .ok() - } else { - None - } + }) }; - let column_stat = ColumnStat { + column_stats.insert(*k, ColumnStat { min, max, - ndv: ndv as f64, + ndv, null_count: col_stat.null_count, histogram, - }; - column_stats.insert(*k as IndexType, column_stat); + }); } } diff --git a/tests/sqllogictests/suites/mode/cluster/explain_v2.test b/tests/sqllogictests/suites/mode/cluster/explain_v2.test index 9bc8f98bdf575..1e298e1502ade 100644 --- a/tests/sqllogictests/suites/mode/cluster/explain_v2.test +++ b/tests/sqllogictests/suites/mode/cluster/explain_v2.test @@ -148,12 +148,12 @@ EvalScalar ├── probe keys: [] ├── other filters: [] ├── Scan - │ ├── table: default.t1 (#0) + │ ├── table: default.t2 (#1) │ ├── filters: [] │ ├── order by: [] │ └── limit: NONE └── Scan - ├── table: default.t2 (#1) + ├── table: default.t1 (#0) ├── filters: [] ├── order by: [] └── limit: NONE @@ -167,18 +167,18 @@ EvalScalar ├── build keys: [t2.a (#2), t2.b (#3)] ├── probe keys: [t1.a (#0), t1.b (#1)] ├── other filters: [] - ├── Filter - │ ├── filters: [gt(t1.a (#0), 2)] - │ └── Scan - │ ├── table: default.t1 (#0) - │ ├── filters: [] - │ ├── order by: [] - │ └── limit: NONE - └── Scan - ├── table: default.t2 (#1) - ├── filters: [] - ├── order by: [] - └── limit: NONE + ├── Scan + │ ├── table: default.t2 (#1) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Filter + ├── filters: [gt(t1.a (#0), 2)] + └── Scan + ├── table: default.t1 (#0) + ├── filters: [] + ├── order by: [] + └── limit: NONE query T explain select count(1) as c, count(b) as d, max(a) as e from t1 order by c, e, d limit 10; diff --git a/tests/sqllogictests/suites/mode/standalone/explain/explain.test b/tests/sqllogictests/suites/mode/standalone/explain/explain.test index a55b4c5e4893c..398f13037d4a5 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/explain.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/explain.test @@ -139,12 +139,12 @@ EvalScalar ├── probe keys: [] ├── other filters: [] ├── Scan - │ ├── table: default.t1 (#0) + │ ├── table: default.t2 (#1) │ ├── filters: [] │ ├── order by: [] │ └── limit: NONE └── Scan - ├── table: default.t2 (#1) + ├── table: default.t1 (#0) ├── filters: [] ├── order by: [] └── limit: NONE @@ -158,18 +158,18 @@ EvalScalar ├── build keys: [t2.a (#2), t2.b (#3)] ├── probe keys: [t1.a (#0), t1.b (#1)] ├── other filters: [] - ├── Filter - │ ├── filters: [gt(t1.a (#0), 2)] - │ └── Scan - │ ├── table: default.t1 (#0) - │ ├── filters: [] - │ ├── order by: [] - │ └── limit: NONE - └── Scan - ├── table: default.t2 (#1) - ├── filters: [] - ├── order by: [] - └── limit: NONE + ├── Scan + │ ├── table: default.t2 (#1) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Filter + ├── filters: [gt(t1.a (#0), 2)] + └── Scan + ├── table: default.t1 (#0) + ├── filters: [] + ├── order by: [] + └── limit: NONE query T explain syntax select 1, 'ab', [1,2,3], (1, 'a') @@ -1148,12 +1148,12 @@ Sort ├── build keys: [t2.k (#2)] ├── probe keys: [t1.i (#0)] ├── other filters: [] - ├── Scan - │ ├── table: default.t1 (#0) - │ ├── filters: [] - │ ├── order by: [] - │ └── limit: NONE - └── EmptyResultScan + ├── EmptyResultScan + └── Scan + ├── table: default.t1 (#0) + ├── filters: [] + ├── order by: [] + └── limit: NONE statement ok diff --git a/tests/sqllogictests/suites/mode/standalone/explain/explain_verbose.test b/tests/sqllogictests/suites/mode/standalone/explain/explain_verbose.test index 44a8ea7592265..ed65d53c2e734 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/explain_verbose.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/explain_verbose.test @@ -95,169 +95,169 @@ Join(Cross) │ ├── t.b (#5): { min: 1, max: 1000, ndv: 1, null count: 0 } │ ├── t.b (#7): { min: 1, max: 1000, ndv: 1, null count: 0 } │ └── t.b (#9): { min: 1, max: 1000, ndv: 1, null count: 0 } -├── Join(Cross) -│ ├── build keys: [] -│ ├── probe keys: [] -│ ├── other filters: [] -│ ├── output columns: [t.a (#0), t.a (#2), t.a (#4), t.a (#6), t.b (#1), t.b (#3), t.b (#5), t.b (#7)] +├── Filter +│ ├── filters: [eq(t.a (#8), 1)] +│ ├── output columns: [t.a (#8), t.b (#9)] │ ├── outer columns: [] -│ ├── used columns: [t.a (#0), t.a (#2), t.a (#4), t.a (#6), t.b (#1), t.b (#3), t.b (#5), t.b (#7)] -│ ├── cardinality: 1.172 +│ ├── used columns: [t.a (#8), t.b (#9)] +│ ├── cardinality: 1.041 │ ├── precise cardinality: N/A │ ├── statistics -│ │ ├── t.a (#0): { min: 1, max: 1, ndv: 1, null count: 0 } -│ │ ├── t.a (#2): { min: 1, max: 1, ndv: 1, null count: 0 } -│ │ ├── t.a (#4): { min: 1, max: 1, ndv: 1, null count: 0 } -│ │ ├── t.a (#6): { min: 1, max: 1, ndv: 1, null count: 0 } -│ │ ├── t.b (#1): { min: 1, max: 1000, ndv: 1, null count: 0 } -│ │ ├── t.b (#3): { min: 1, max: 1000, ndv: 1, null count: 0 } -│ │ ├── t.b (#5): { min: 1, max: 1000, ndv: 1, null count: 0 } -│ │ └── t.b (#7): { min: 1, max: 1000, ndv: 1, null count: 0 } -│ ├── Join(Cross) -│ │ ├── build keys: [] -│ │ ├── probe keys: [] -│ │ ├── other filters: [] -│ │ ├── output columns: [t.a (#0), t.a (#2), t.a (#4), t.b (#1), t.b (#3), t.b (#5)] -│ │ ├── outer columns: [] -│ │ ├── used columns: [t.a (#0), t.a (#2), t.a (#4), t.b (#1), t.b (#3), t.b (#5)] -│ │ ├── cardinality: 1.127 -│ │ ├── precise cardinality: N/A -│ │ ├── statistics -│ │ │ ├── t.a (#0): { min: 1, max: 1, ndv: 1, null count: 0 } -│ │ │ ├── t.a (#2): { min: 1, max: 1, ndv: 1, null count: 0 } -│ │ │ ├── t.a (#4): { min: 1, max: 1, ndv: 1, null count: 0 } -│ │ │ ├── t.b (#1): { min: 1, max: 1000, ndv: 1, null count: 0 } -│ │ │ ├── t.b (#3): { min: 1, max: 1000, ndv: 1, null count: 0 } -│ │ │ └── t.b (#5): { min: 1, max: 1000, ndv: 1, null count: 0 } -│ │ ├── Join(Cross) -│ │ │ ├── build keys: [] -│ │ │ ├── probe keys: [] -│ │ │ ├── other filters: [] -│ │ │ ├── output columns: [t.a (#0), t.a (#2), t.b (#1), t.b (#3)] -│ │ │ ├── outer columns: [] -│ │ │ ├── used columns: [t.a (#0), t.a (#2), t.b (#1), t.b (#3)] -│ │ │ ├── cardinality: 1.083 -│ │ │ ├── precise cardinality: N/A -│ │ │ ├── statistics -│ │ │ │ ├── t.a (#0): { min: 1, max: 1, ndv: 1, null count: 0 } -│ │ │ │ ├── t.a (#2): { min: 1, max: 1, ndv: 1, null count: 0 } -│ │ │ │ ├── t.b (#1): { min: 1, max: 1000, ndv: 1, null count: 0 } -│ │ │ │ └── t.b (#3): { min: 1, max: 1000, ndv: 1, null count: 0 } -│ │ │ ├── Filter -│ │ │ │ ├── filters: [eq(t.a (#0), 1)] -│ │ │ │ ├── output columns: [t.a (#0), t.b (#1)] -│ │ │ │ ├── outer columns: [] -│ │ │ │ ├── used columns: [t.a (#0), t.b (#1)] -│ │ │ │ ├── cardinality: 1.041 -│ │ │ │ ├── precise cardinality: N/A -│ │ │ │ ├── statistics -│ │ │ │ │ ├── t.a (#0): { min: 1, max: 1, ndv: 1, null count: 0 } -│ │ │ │ │ └── t.b (#1): { min: 1, max: 1000, ndv: 1, null count: 0 } -│ │ │ │ └── Scan -│ │ │ │ ├── table: testdb.t (#0) -│ │ │ │ ├── filters: [eq(t.a (#0), 1)] -│ │ │ │ ├── order by: [] -│ │ │ │ ├── limit: NONE -│ │ │ │ ├── output columns: [t.a (#0), t.b (#1)] -│ │ │ │ ├── outer columns: [] -│ │ │ │ ├── used columns: [t.a (#0), t.b (#1)] -│ │ │ │ ├── cardinality: 1000.000 -│ │ │ │ ├── precise cardinality: 1000 -│ │ │ │ └── statistics -│ │ │ │ ├── t.a (#0): { min: 0, max: 999, ndv: 961, null count: 0 } -│ │ │ │ └── t.b (#1): { min: 1, max: 1000, ndv: 961, null count: 0 } -│ │ │ └── Filter -│ │ │ ├── filters: [eq(t.a (#2), 1)] -│ │ │ ├── output columns: [t.a (#2), t.b (#3)] -│ │ │ ├── outer columns: [] -│ │ │ ├── used columns: [t.a (#2), t.b (#3)] -│ │ │ ├── cardinality: 1.041 -│ │ │ ├── precise cardinality: N/A -│ │ │ ├── statistics -│ │ │ │ ├── t.a (#2): { min: 1, max: 1, ndv: 1, null count: 0 } -│ │ │ │ └── t.b (#3): { min: 1, max: 1000, ndv: 1, null count: 0 } -│ │ │ └── Scan -│ │ │ ├── table: testdb.t (#1) -│ │ │ ├── filters: [eq(t.a (#2), 1)] -│ │ │ ├── order by: [] -│ │ │ ├── limit: NONE -│ │ │ ├── output columns: [t.a (#2), t.b (#3)] -│ │ │ ├── outer columns: [] -│ │ │ ├── used columns: [t.a (#2), t.b (#3)] -│ │ │ ├── cardinality: 1000.000 -│ │ │ ├── precise cardinality: 1000 -│ │ │ └── statistics -│ │ │ ├── t.a (#2): { min: 0, max: 999, ndv: 961, null count: 0 } -│ │ │ └── t.b (#3): { min: 1, max: 1000, ndv: 961, null count: 0 } -│ │ └── Filter -│ │ ├── filters: [eq(t.a (#4), 1)] -│ │ ├── output columns: [t.a (#4), t.b (#5)] -│ │ ├── outer columns: [] -│ │ ├── used columns: [t.a (#4), t.b (#5)] -│ │ ├── cardinality: 1.041 -│ │ ├── precise cardinality: N/A -│ │ ├── statistics -│ │ │ ├── t.a (#4): { min: 1, max: 1, ndv: 1, null count: 0 } -│ │ │ └── t.b (#5): { min: 1, max: 1000, ndv: 1, null count: 0 } -│ │ └── Scan -│ │ ├── table: testdb.t (#2) -│ │ ├── filters: [eq(t.a (#4), 1)] -│ │ ├── order by: [] -│ │ ├── limit: NONE -│ │ ├── output columns: [t.a (#4), t.b (#5)] -│ │ ├── outer columns: [] -│ │ ├── used columns: [t.a (#4), t.b (#5)] -│ │ ├── cardinality: 1000.000 -│ │ ├── precise cardinality: 1000 -│ │ └── statistics -│ │ ├── t.a (#4): { min: 0, max: 999, ndv: 961, null count: 0 } -│ │ └── t.b (#5): { min: 1, max: 1000, ndv: 961, null count: 0 } -│ └── Filter -│ ├── filters: [eq(t.a (#6), 1)] -│ ├── output columns: [t.a (#6), t.b (#7)] +│ │ ├── t.a (#8): { min: 1, max: 1, ndv: 1, null count: 0 } +│ │ └── t.b (#9): { min: 1, max: 1000, ndv: 1, null count: 0 } +│ └── Scan +│ ├── table: testdb.t (#4) +│ ├── filters: [eq(t.a (#8), 1)] +│ ├── order by: [] +│ ├── limit: NONE +│ ├── output columns: [t.a (#8), t.b (#9)] │ ├── outer columns: [] -│ ├── used columns: [t.a (#6), t.b (#7)] -│ ├── cardinality: 1.041 -│ ├── precise cardinality: N/A -│ ├── statistics -│ │ ├── t.a (#6): { min: 1, max: 1, ndv: 1, null count: 0 } -│ │ └── t.b (#7): { min: 1, max: 1000, ndv: 1, null count: 0 } -│ └── Scan -│ ├── table: testdb.t (#3) -│ ├── filters: [eq(t.a (#6), 1)] -│ ├── order by: [] -│ ├── limit: NONE -│ ├── output columns: [t.a (#6), t.b (#7)] -│ ├── outer columns: [] -│ ├── used columns: [t.a (#6), t.b (#7)] -│ ├── cardinality: 1000.000 -│ ├── precise cardinality: 1000 -│ └── statistics -│ ├── t.a (#6): { min: 0, max: 999, ndv: 961, null count: 0 } -│ └── t.b (#7): { min: 1, max: 1000, ndv: 961, null count: 0 } -└── Filter - ├── filters: [eq(t.a (#8), 1)] - ├── output columns: [t.a (#8), t.b (#9)] +│ ├── used columns: [t.a (#8), t.b (#9)] +│ ├── cardinality: 1000.000 +│ ├── precise cardinality: 1000 +│ └── statistics +│ ├── t.a (#8): { min: 0, max: 999, ndv: 961, null count: 0 } +│ └── t.b (#9): { min: 1, max: 1000, ndv: 961, null count: 0 } +└── Join(Cross) + ├── build keys: [] + ├── probe keys: [] + ├── other filters: [] + ├── output columns: [t.a (#0), t.a (#2), t.a (#4), t.a (#6), t.b (#1), t.b (#3), t.b (#5), t.b (#7)] ├── outer columns: [] - ├── used columns: [t.a (#8), t.b (#9)] - ├── cardinality: 1.041 + ├── used columns: [t.a (#0), t.a (#2), t.a (#4), t.a (#6), t.b (#1), t.b (#3), t.b (#5), t.b (#7)] + ├── cardinality: 1.172 ├── precise cardinality: N/A ├── statistics - │ ├── t.a (#8): { min: 1, max: 1, ndv: 1, null count: 0 } - │ └── t.b (#9): { min: 1, max: 1000, ndv: 1, null count: 0 } - └── Scan - ├── table: testdb.t (#4) - ├── filters: [eq(t.a (#8), 1)] - ├── order by: [] - ├── limit: NONE - ├── output columns: [t.a (#8), t.b (#9)] + │ ├── t.a (#0): { min: 1, max: 1, ndv: 1, null count: 0 } + │ ├── t.a (#2): { min: 1, max: 1, ndv: 1, null count: 0 } + │ ├── t.a (#4): { min: 1, max: 1, ndv: 1, null count: 0 } + │ ├── t.a (#6): { min: 1, max: 1, ndv: 1, null count: 0 } + │ ├── t.b (#1): { min: 1, max: 1000, ndv: 1, null count: 0 } + │ ├── t.b (#3): { min: 1, max: 1000, ndv: 1, null count: 0 } + │ ├── t.b (#5): { min: 1, max: 1000, ndv: 1, null count: 0 } + │ └── t.b (#7): { min: 1, max: 1000, ndv: 1, null count: 0 } + ├── Filter + │ ├── filters: [eq(t.a (#6), 1)] + │ ├── output columns: [t.a (#6), t.b (#7)] + │ ├── outer columns: [] + │ ├── used columns: [t.a (#6), t.b (#7)] + │ ├── cardinality: 1.041 + │ ├── precise cardinality: N/A + │ ├── statistics + │ │ ├── t.a (#6): { min: 1, max: 1, ndv: 1, null count: 0 } + │ │ └── t.b (#7): { min: 1, max: 1000, ndv: 1, null count: 0 } + │ └── Scan + │ ├── table: testdb.t (#3) + │ ├── filters: [eq(t.a (#6), 1)] + │ ├── order by: [] + │ ├── limit: NONE + │ ├── output columns: [t.a (#6), t.b (#7)] + │ ├── outer columns: [] + │ ├── used columns: [t.a (#6), t.b (#7)] + │ ├── cardinality: 1000.000 + │ ├── precise cardinality: 1000 + │ └── statistics + │ ├── t.a (#6): { min: 0, max: 999, ndv: 961, null count: 0 } + │ └── t.b (#7): { min: 1, max: 1000, ndv: 961, null count: 0 } + └── Join(Cross) + ├── build keys: [] + ├── probe keys: [] + ├── other filters: [] + ├── output columns: [t.a (#0), t.a (#2), t.a (#4), t.b (#1), t.b (#3), t.b (#5)] ├── outer columns: [] - ├── used columns: [t.a (#8), t.b (#9)] - ├── cardinality: 1000.000 - ├── precise cardinality: 1000 - └── statistics - ├── t.a (#8): { min: 0, max: 999, ndv: 961, null count: 0 } - └── t.b (#9): { min: 1, max: 1000, ndv: 961, null count: 0 } + ├── used columns: [t.a (#0), t.a (#2), t.a (#4), t.b (#1), t.b (#3), t.b (#5)] + ├── cardinality: 1.127 + ├── precise cardinality: N/A + ├── statistics + │ ├── t.a (#0): { min: 1, max: 1, ndv: 1, null count: 0 } + │ ├── t.a (#2): { min: 1, max: 1, ndv: 1, null count: 0 } + │ ├── t.a (#4): { min: 1, max: 1, ndv: 1, null count: 0 } + │ ├── t.b (#1): { min: 1, max: 1000, ndv: 1, null count: 0 } + │ ├── t.b (#3): { min: 1, max: 1000, ndv: 1, null count: 0 } + │ └── t.b (#5): { min: 1, max: 1000, ndv: 1, null count: 0 } + ├── Filter + │ ├── filters: [eq(t.a (#4), 1)] + │ ├── output columns: [t.a (#4), t.b (#5)] + │ ├── outer columns: [] + │ ├── used columns: [t.a (#4), t.b (#5)] + │ ├── cardinality: 1.041 + │ ├── precise cardinality: N/A + │ ├── statistics + │ │ ├── t.a (#4): { min: 1, max: 1, ndv: 1, null count: 0 } + │ │ └── t.b (#5): { min: 1, max: 1000, ndv: 1, null count: 0 } + │ └── Scan + │ ├── table: testdb.t (#2) + │ ├── filters: [eq(t.a (#4), 1)] + │ ├── order by: [] + │ ├── limit: NONE + │ ├── output columns: [t.a (#4), t.b (#5)] + │ ├── outer columns: [] + │ ├── used columns: [t.a (#4), t.b (#5)] + │ ├── cardinality: 1000.000 + │ ├── precise cardinality: 1000 + │ └── statistics + │ ├── t.a (#4): { min: 0, max: 999, ndv: 961, null count: 0 } + │ └── t.b (#5): { min: 1, max: 1000, ndv: 961, null count: 0 } + └── Join(Cross) + ├── build keys: [] + ├── probe keys: [] + ├── other filters: [] + ├── output columns: [t.a (#0), t.a (#2), t.b (#1), t.b (#3)] + ├── outer columns: [] + ├── used columns: [t.a (#0), t.a (#2), t.b (#1), t.b (#3)] + ├── cardinality: 1.083 + ├── precise cardinality: N/A + ├── statistics + │ ├── t.a (#0): { min: 1, max: 1, ndv: 1, null count: 0 } + │ ├── t.a (#2): { min: 1, max: 1, ndv: 1, null count: 0 } + │ ├── t.b (#1): { min: 1, max: 1000, ndv: 1, null count: 0 } + │ └── t.b (#3): { min: 1, max: 1000, ndv: 1, null count: 0 } + ├── Filter + │ ├── filters: [eq(t.a (#2), 1)] + │ ├── output columns: [t.a (#2), t.b (#3)] + │ ├── outer columns: [] + │ ├── used columns: [t.a (#2), t.b (#3)] + │ ├── cardinality: 1.041 + │ ├── precise cardinality: N/A + │ ├── statistics + │ │ ├── t.a (#2): { min: 1, max: 1, ndv: 1, null count: 0 } + │ │ └── t.b (#3): { min: 1, max: 1000, ndv: 1, null count: 0 } + │ └── Scan + │ ├── table: testdb.t (#1) + │ ├── filters: [eq(t.a (#2), 1)] + │ ├── order by: [] + │ ├── limit: NONE + │ ├── output columns: [t.a (#2), t.b (#3)] + │ ├── outer columns: [] + │ ├── used columns: [t.a (#2), t.b (#3)] + │ ├── cardinality: 1000.000 + │ ├── precise cardinality: 1000 + │ └── statistics + │ ├── t.a (#2): { min: 0, max: 999, ndv: 961, null count: 0 } + │ └── t.b (#3): { min: 1, max: 1000, ndv: 961, null count: 0 } + └── Filter + ├── filters: [eq(t.a (#0), 1)] + ├── output columns: [t.a (#0), t.b (#1)] + ├── outer columns: [] + ├── used columns: [t.a (#0), t.b (#1)] + ├── cardinality: 1.041 + ├── precise cardinality: N/A + ├── statistics + │ ├── t.a (#0): { min: 1, max: 1, ndv: 1, null count: 0 } + │ └── t.b (#1): { min: 1, max: 1000, ndv: 1, null count: 0 } + └── Scan + ├── table: testdb.t (#0) + ├── filters: [eq(t.a (#0), 1)] + ├── order by: [] + ├── limit: NONE + ├── output columns: [t.a (#0), t.b (#1)] + ├── outer columns: [] + ├── used columns: [t.a (#0), t.b (#1)] + ├── cardinality: 1000.000 + ├── precise cardinality: 1000 + └── statistics + ├── t.a (#0): { min: 0, max: 999, ndv: 961, null count: 0 } + └── t.b (#1): { min: 1, max: 1000, ndv: 961, null count: 0 } query T explain(decorrelated,verbose) select a, exists(select * from numbers(1000) where number % 10 = t.b) mark from t; @@ -280,74 +280,74 @@ EvalScalar ├── cardinality: 0.000 ├── precise cardinality: N/A ├── statistics - ├── Scan - │ ├── table: testdb.t (#0) - │ ├── filters: [] - │ ├── order by: [] - │ ├── limit: NONE - │ ├── output columns: [t.a (#0), t.b (#1)] + ├── EvalScalar + │ ├── scalars: [numbers.number (#2) AS (#2), b (#5) AS (#5)] + │ ├── output columns: [b (#5), numbers.number (#2)] │ ├── outer columns: [] - │ ├── used columns: [t.a (#0), t.b (#1)] + │ ├── used columns: [a (#4), b (#5), numbers.number (#2)] │ ├── cardinality: 0.000 │ ├── precise cardinality: N/A - │ └── statistics - └── EvalScalar - ├── scalars: [numbers.number (#2) AS (#2), b (#5) AS (#5)] - ├── output columns: [b (#5), numbers.number (#2)] + │ ├── statistics + │ └── Filter + │ ├── filters: [eq(modulo(numbers.number (#2), 10), b (#5))] + │ ├── output columns: [b (#5), numbers.number (#2)] + │ ├── outer columns: [] + │ ├── used columns: [a (#4), b (#5), numbers.number (#2)] + │ ├── cardinality: 0.000 + │ ├── precise cardinality: N/A + │ ├── statistics + │ └── Join(Cross) + │ ├── build keys: [] + │ ├── probe keys: [] + │ ├── other filters: [] + │ ├── output columns: [b (#5), numbers.number (#2)] + │ ├── outer columns: [] + │ ├── used columns: [a (#4), b (#5), numbers.number (#2)] + │ ├── cardinality: 0.000 + │ ├── precise cardinality: N/A + │ ├── statistics + │ ├── Scan + │ │ ├── table: system.numbers (#1) + │ │ ├── filters: [] + │ │ ├── order by: [] + │ │ ├── limit: NONE + │ │ ├── output columns: [numbers.number (#2)] + │ │ ├── outer columns: [] + │ │ ├── used columns: [numbers.number (#2)] + │ │ ├── cardinality: 0.000 + │ │ ├── precise cardinality: N/A + │ │ └── statistics + │ └── Aggregate(Initial) + │ ├── group items: [b (#5) AS (#5)] + │ ├── aggregate functions: [] + │ ├── output columns: [b (#5)] + │ ├── outer columns: [] + │ ├── used columns: [a (#4), b (#5)] + │ ├── cardinality: 1.000 + │ ├── precise cardinality: N/A + │ ├── statistics + │ └── Scan + │ ├── table: testdb.t (#0) + │ ├── filters: [] + │ ├── order by: [] + │ ├── limit: NONE + │ ├── output columns: [a (#4), b (#5)] + │ ├── outer columns: [] + │ ├── used columns: [a (#4), b (#5)] + │ ├── cardinality: 0.000 + │ ├── precise cardinality: N/A + │ └── statistics + └── Scan + ├── table: testdb.t (#0) + ├── filters: [] + ├── order by: [] + ├── limit: NONE + ├── output columns: [t.a (#0), t.b (#1)] ├── outer columns: [] - ├── used columns: [a (#4), b (#5), numbers.number (#2)] + ├── used columns: [t.a (#0), t.b (#1)] ├── cardinality: 0.000 ├── precise cardinality: N/A - ├── statistics - └── Filter - ├── filters: [eq(modulo(numbers.number (#2), 10), b (#5))] - ├── output columns: [b (#5), numbers.number (#2)] - ├── outer columns: [] - ├── used columns: [a (#4), b (#5), numbers.number (#2)] - ├── cardinality: 0.000 - ├── precise cardinality: N/A - ├── statistics - └── Join(Cross) - ├── build keys: [] - ├── probe keys: [] - ├── other filters: [] - ├── output columns: [b (#5), numbers.number (#2)] - ├── outer columns: [] - ├── used columns: [a (#4), b (#5), numbers.number (#2)] - ├── cardinality: 0.000 - ├── precise cardinality: N/A - ├── statistics - ├── Aggregate(Initial) - │ ├── group items: [b (#5) AS (#5)] - │ ├── aggregate functions: [] - │ ├── output columns: [b (#5)] - │ ├── outer columns: [] - │ ├── used columns: [a (#4), b (#5)] - │ ├── cardinality: 1.000 - │ ├── precise cardinality: N/A - │ ├── statistics - │ └── Scan - │ ├── table: testdb.t (#0) - │ ├── filters: [] - │ ├── order by: [] - │ ├── limit: NONE - │ ├── output columns: [a (#4), b (#5)] - │ ├── outer columns: [] - │ ├── used columns: [a (#4), b (#5)] - │ ├── cardinality: 0.000 - │ ├── precise cardinality: N/A - │ └── statistics - └── Scan - ├── table: system.numbers (#1) - ├── filters: [] - ├── order by: [] - ├── limit: NONE - ├── output columns: [numbers.number (#2)] - ├── outer columns: [] - ├── used columns: [numbers.number (#2)] - ├── cardinality: 0.000 - ├── precise cardinality: N/A - └── statistics + └── statistics statement ok diff --git a/tests/sqllogictests/suites/mode/standalone/explain/push_down_filter/push_down_filter_join/push_down_filter_join_inner.test b/tests/sqllogictests/suites/mode/standalone/explain/push_down_filter/push_down_filter_join/push_down_filter_join_inner.test index 5a84de97e4a5e..992024abf7f75 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/push_down_filter/push_down_filter_join/push_down_filter_join_inner.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/push_down_filter/push_down_filter_join/push_down_filter_join_inner.test @@ -70,7 +70,7 @@ explain select * from t1 inner join t2 on t1.a = t2.a where t2.a <= 2 or (t1.a > Filter ├── output columns: [t1.a (#0), t1.b (#1), t2.b (#3), t2.a (#2)] ├── filters: [is_true((t2.a (#2) <= 2 OR (t1.a (#0) > 1 AND t2.a (#2) > 1)))] -├── estimated rows: 3.56 +├── estimated rows: 2.00 └── HashJoin ├── output columns: [t1.a (#0), t1.b (#1), t2.b (#3), t2.a (#2)] ├── join type: INNER @@ -80,11 +80,11 @@ Filter ├── filters: [] ├── build join filters: │ └── filter id:0, build key:t2.a (#2), probe targets:[t1.a (#0)@scan0], filter type:bloom,inlist,min_max - ├── estimated rows: 3.56 + ├── estimated rows: 2.67 ├── Filter(Build) │ ├── output columns: [t2.a (#2), t2.b (#3)] │ ├── filters: [is_true((t2.a (#2) <= 2 OR t2.a (#2) > 1))] - │ ├── estimated rows: 3.00 + │ ├── estimated rows: 2.25 │ └── TableScan │ ├── table: default.default.t2 │ ├── scan id: 1 diff --git a/tests/sqllogictests/suites/mode/standalone/explain/subquery.test b/tests/sqllogictests/suites/mode/standalone/explain/subquery.test index 43799dc3962bb..657e0fac1516e 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain/subquery.test +++ b/tests/sqllogictests/suites/mode/standalone/explain/subquery.test @@ -648,29 +648,29 @@ EvalScalar ├── build keys: [] ├── probe keys: [] ├── other filters: [] - ├── Scan - │ ├── table: default.t (#0) - │ ├── filters: [] - │ ├── order by: [] - │ └── limit: NONE - └── EvalScalar - ├── scalars: [eq(count(*) (#3), 1) AS (#4)] - └── Aggregate(Final) - ├── group items: [] - ├── aggregate functions: [count() AS (#3)] - └── Aggregate(Partial) - ├── group items: [] - ├── aggregate functions: [count() AS (#3)] - └── Limit - ├── limit: [1] - ├── offset: [0] - └── Filter - ├── filters: [gt(t.i (#1), 10)] - └── Scan - ├── table: default.t (#1) - ├── filters: [gt(t.i (#1), 10)] - ├── order by: [] - └── limit: NONE + ├── EvalScalar + │ ├── scalars: [eq(count(*) (#3), 1) AS (#4)] + │ └── Aggregate(Final) + │ ├── group items: [] + │ ├── aggregate functions: [count() AS (#3)] + │ └── Aggregate(Partial) + │ ├── group items: [] + │ ├── aggregate functions: [count() AS (#3)] + │ └── Limit + │ ├── limit: [1] + │ ├── offset: [0] + │ └── Filter + │ ├── filters: [gt(t.i (#1), 10)] + │ └── Scan + │ ├── table: default.t (#1) + │ ├── filters: [gt(t.i (#1), 10)] + │ ├── order by: [] + │ └── limit: NONE + └── Scan + ├── table: default.t (#0) + ├── filters: [] + ├── order by: [] + └── limit: NONE query T @@ -682,28 +682,28 @@ EvalScalar ├── build keys: [] ├── probe keys: [] ├── other filters: [] - ├── Scan - │ ├── table: default.t (#0) - │ ├── filters: [] - │ ├── order by: [] - │ └── limit: NONE - └── EvalScalar - ├── scalars: [eq(count(*) (#3), 1) AS (#4)] - └── Aggregate(Initial) - ├── group items: [] - ├── aggregate functions: [count() AS (#3)] - └── Limit - ├── limit: [1] - ├── offset: [0] - └── EvalScalar - ├── scalars: [t.i (#1) AS (#1)] - └── Filter - ├── filters: [gt(t.i (#1), 10)] - └── Scan - ├── table: default.t (#1) - ├── filters: [] - ├── order by: [] - └── limit: NONE + ├── EvalScalar + │ ├── scalars: [eq(count(*) (#3), 1) AS (#4)] + │ └── Aggregate(Initial) + │ ├── group items: [] + │ ├── aggregate functions: [count() AS (#3)] + │ └── Limit + │ ├── limit: [1] + │ ├── offset: [0] + │ └── EvalScalar + │ ├── scalars: [t.i (#1) AS (#1)] + │ └── Filter + │ ├── filters: [gt(t.i (#1), 10)] + │ └── Scan + │ ├── table: default.t (#1) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Scan + ├── table: default.t (#0) + ├── filters: [] + ├── order by: [] + └── limit: NONE query T explain select t.number from numbers(10) as t where t.number = (select 1); diff --git a/tests/sqllogictests/suites/mode/standalone/explain_native/explain.test b/tests/sqllogictests/suites/mode/standalone/explain_native/explain.test index 87d71eb3bde42..d3322402f5f16 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain_native/explain.test +++ b/tests/sqllogictests/suites/mode/standalone/explain_native/explain.test @@ -115,12 +115,12 @@ EvalScalar ├── probe keys: [] ├── other filters: [] ├── Scan - │ ├── table: default.t1 (#0) + │ ├── table: default.t2 (#1) │ ├── filters: [] │ ├── order by: [] │ └── limit: NONE └── Scan - ├── table: default.t2 (#1) + ├── table: default.t1 (#0) ├── filters: [] ├── order by: [] └── limit: NONE @@ -134,18 +134,18 @@ EvalScalar ├── build keys: [t2.a (#2), t2.b (#3)] ├── probe keys: [t1.a (#0), t1.b (#1)] ├── other filters: [] - ├── Filter - │ ├── filters: [gt(t1.a (#0), 2)] - │ └── Scan - │ ├── table: default.t1 (#0) - │ ├── filters: [] - │ ├── order by: [] - │ └── limit: NONE - └── Scan - ├── table: default.t2 (#1) - ├── filters: [] - ├── order by: [] - └── limit: NONE + ├── Scan + │ ├── table: default.t2 (#1) + │ ├── filters: [] + │ ├── order by: [] + │ └── limit: NONE + └── Filter + ├── filters: [gt(t1.a (#0), 2)] + └── Scan + ├── table: default.t1 (#0) + ├── filters: [] + ├── order by: [] + └── limit: NONE query T explain syntax select 1, 'ab', [1,2,3], (1, 'a') diff --git a/tests/sqllogictests/suites/mode/standalone/explain_native/push_down_filter/push_down_filter_join/push_down_filter_join_inner.test b/tests/sqllogictests/suites/mode/standalone/explain_native/push_down_filter/push_down_filter_join/push_down_filter_join_inner.test index 294ccc99c7f0c..5e6f2be96ab34 100644 --- a/tests/sqllogictests/suites/mode/standalone/explain_native/push_down_filter/push_down_filter_join/push_down_filter_join_inner.test +++ b/tests/sqllogictests/suites/mode/standalone/explain_native/push_down_filter/push_down_filter_join/push_down_filter_join_inner.test @@ -62,7 +62,7 @@ explain select * from t1 inner join t2 on t1.a = t2.a where t2.a <= 2 or (t1.a > Filter ├── output columns: [t1.a (#0), t1.b (#1), t2.b (#3), t2.a (#2)] ├── filters: [is_true((t2.a (#2) <= 2 OR (t1.a (#0) > 1 AND t2.a (#2) > 1)))] -├── estimated rows: 3.56 +├── estimated rows: 2.00 └── HashJoin ├── output columns: [t1.a (#0), t1.b (#1), t2.b (#3), t2.a (#2)] ├── join type: INNER @@ -72,7 +72,7 @@ Filter ├── filters: [] ├── build join filters: │ └── filter id:0, build key:t2.a (#2), probe targets:[t1.a (#0)@scan0], filter type:bloom,inlist,min_max - ├── estimated rows: 3.56 + ├── estimated rows: 2.67 ├── TableScan(Build) │ ├── table: default.default.t2 │ ├── scan id: 1 @@ -83,7 +83,7 @@ Filter │ ├── partitions scanned: 1 │ ├── pruning stats: [segments: >, blocks: >] │ ├── push downs: [filters: [is_true((t2.a (#2) <= 2 OR t2.a (#2) > 1))], limit: NONE] - │ └── estimated rows: 3.00 + │ └── estimated rows: 2.25 └── TableScan(Probe) ├── table: default.default.t1 ├── scan id: 0 diff --git a/tests/sqllogictests/suites/tpch/join_order.test b/tests/sqllogictests/suites/tpch/join_order.test index 9113f51ca95db..557d1b7f8177c 100644 --- a/tests/sqllogictests/suites/tpch/join_order.test +++ b/tests/sqllogictests/suites/tpch/join_order.test @@ -240,15 +240,15 @@ HashJoin: INNER │ ├── Build │ │ └── HashJoin: INNER │ │ ├── Build - │ │ │ └── HashJoin: INNER - │ │ │ ├── Build - │ │ │ │ └── Scan: default.tpch_test.region (#5) (read rows: 5) - │ │ │ └── Probe - │ │ │ └── Scan: default.tpch_test.nation (#4) (read rows: 25) + │ │ │ └── Scan: default.tpch_test.region (#5) (read rows: 5) │ │ └── Probe - │ │ └── Scan: default.tpch_test.customer (#0) (read rows: 150000) + │ │ └── Scan: default.tpch_test.nation (#4) (read rows: 25) │ └── Probe - │ └── Scan: default.tpch_test.orders (#1) (read rows: 1500000) + │ └── HashJoin: INNER + │ ├── Build + │ │ └── Scan: default.tpch_test.customer (#0) (read rows: 150000) + │ └── Probe + │ └── Scan: default.tpch_test.orders (#1) (read rows: 1500000) └── Probe └── Scan: default.tpch_test.lineitem (#2) (read rows: 6001215) @@ -372,7 +372,11 @@ order by ---- HashJoin: INNER ├── Build -│ └── Scan: default.tpch_test.nation (#6) (read rows: 25) +│ └── HashJoin: INNER +│ ├── Build +│ │ └── Scan: default.tpch_test.nation (#6) (read rows: 25) +│ └── Probe +│ └── Scan: default.tpch_test.supplier (#1) (read rows: 10000) └── Probe └── HashJoin: INNER ├── Build @@ -384,21 +388,17 @@ HashJoin: INNER │ │ └── Probe │ │ └── Scan: default.tpch_test.nation (#5) (read rows: 25) │ └── Probe - │ └── HashJoin: INNER - │ ├── Build - │ │ └── HashJoin: INNER - │ │ ├── Build - │ │ │ └── HashJoin: INNER - │ │ │ ├── Build - │ │ │ │ └── Scan: default.tpch_test.part (#0) (read rows: 200000) - │ │ │ └── Probe - │ │ │ └── Scan: default.tpch_test.lineitem (#2) (read rows: 6001215) - │ │ └── Probe - │ │ └── Scan: default.tpch_test.orders (#3) (read rows: 1500000) - │ └── Probe - │ └── Scan: default.tpch_test.customer (#4) (read rows: 150000) + │ └── Scan: default.tpch_test.customer (#4) (read rows: 150000) └── Probe - └── Scan: default.tpch_test.supplier (#1) (read rows: 10000) + └── HashJoin: INNER + ├── Build + │ └── HashJoin: INNER + │ ├── Build + │ │ └── Scan: default.tpch_test.part (#0) (read rows: 200000) + │ └── Probe + │ └── Scan: default.tpch_test.lineitem (#2) (read rows: 6001215) + └── Probe + └── Scan: default.tpch_test.orders (#3) (read rows: 1500000) # Q9 query I