3 changes: 3 additions & 0 deletions src/query/sql/src/planner/optimizer/ir/stats/column_stat.rs
@@ -37,6 +37,9 @@ pub struct ColumnStat {
/// Count of null values
pub null_count: u64,

/// Original minimum value of the column
pub origin_min: Datum,
/// Original maximum value of the column
pub origin_max: Datum,

/// Histogram of column
pub histogram: Option<Histogram>,
}
@@ -19,6 +19,7 @@ mod rule_grouping_sets_to_union;
mod rule_hierarchical_grouping_sets;
mod rule_push_down_filter_aggregate;
mod rule_push_down_limit_aggregate;
mod rule_shrink_group_by_type;
mod rule_split_aggregate;
mod rule_try_apply_agg_index;

@@ -28,5 +29,6 @@ pub use rule_grouping_sets_to_union::RuleGroupingSetsToUnion;
pub use rule_hierarchical_grouping_sets::RuleHierarchicalGroupingSetsToUnion;
pub use rule_push_down_filter_aggregate::RulePushDownFilterAggregate;
pub use rule_push_down_limit_aggregate::RulePushDownRankLimitAggregate;
pub use rule_shrink_group_by_type::RuleShrinkGroupByType;
pub use rule_split_aggregate::RuleSplitAggregate;
pub use rule_try_apply_agg_index::RuleTryApplyAggIndex;
@@ -0,0 +1,371 @@
// Copyright 2021 Datafuse Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::Arc;

use databend_common_exception::Result;
use databend_common_expression::types::DataType;
use databend_common_expression::types::NumberDataType;
use databend_common_storage::Datum;

use crate::IndexType;
use crate::MetadataRef;
use crate::Visibility;
use crate::binder::ColumnBinding;
use crate::binder::ColumnBindingBuilder;
use crate::optimizer::ir::ColumnStat;
use crate::optimizer::ir::Matcher;
use crate::optimizer::ir::RelExpr;
use crate::optimizer::ir::SExpr;
use crate::optimizer::optimizers::rule::Rule;
use crate::optimizer::optimizers::rule::RuleID;
use crate::optimizer::optimizers::rule::TransformResult;
use crate::plans::Aggregate;
use crate::plans::AggregateMode;
use crate::plans::BoundColumnRef;
use crate::plans::CastExpr;
use crate::plans::EvalScalar;
use crate::plans::RelOp;
use crate::plans::ScalarExpr;
use crate::plans::ScalarItem;

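/// Rewrites integer GROUP BY keys to a narrower integer type when the column's
/// origin_min/origin_max statistics show every value fits, casting the key down
/// below the aggregate and casting it back to its original type above it.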
pub struct RuleShrinkGroupByType {
id: RuleID,
matchers: Vec<Matcher>,
metadata: MetadataRef,
}

impl RuleShrinkGroupByType {
pub fn new(metadata: MetadataRef) -> Self {
Self {
id: RuleID::ShrinkGroupByType,
matchers: vec![Matcher::MatchOp {
op_type: RelOp::Aggregate,
children: vec![Matcher::Leaf],
}],
metadata,
}
}
}

impl Rule for RuleShrinkGroupByType {
fn id(&self) -> RuleID {
self.id
}

fn apply(&self, s_expr: &SExpr, state: &mut TransformResult) -> Result<()> {
let agg: Aggregate = s_expr.plan().clone().try_into()?;
if agg.group_items.is_empty()
|| agg.mode != AggregateMode::Initial
|| agg.grouping_sets.is_some()
{
return Ok(());
}

let rel_expr = RelExpr::with_s_expr(s_expr);
let child_stat = rel_expr.derive_cardinality_child(0)?;
let column_stats = &child_stat.statistics.column_stats;

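// Collect the group keys that are plain column references and whose statistics allow a narrower integer type.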
let mut rewrites = Vec::with_capacity(agg.group_items.len());
for (idx, item) in agg.group_items.iter().enumerate() {
let ScalarExpr::BoundColumnRef(col_ref) = &item.scalar else {
continue;
};
let Some(stat) = column_stats.get(&item.index) else {
continue;
};
let Some(target_type) =
shrink_group_by_data_type(col_ref.column.data_type.as_ref(), stat)
else {
continue;
};

let mut writer = self.metadata.write();
let shrink_index = writer.add_derived_column(
format!("{}_shrink", col_ref.column.column_name),
target_type.clone(),
);
drop(writer);

let shrink_binding = ColumnBindingBuilder::new(
format!("{}_shrink", col_ref.column.column_name),
shrink_index,
Box::new(target_type.clone()),
Visibility::Visible,
)
.build();

rewrites.push(GroupByRewrite {
position: idx,
original_binding: col_ref.column.clone(),
original_index: item.index,
shrink_binding,
shrink_type: target_type,
shrink_index,
});
}

if rewrites.is_empty() {
return Ok(());
}

let mut new_group_items = agg.group_items.clone();
let mut lower_items = Vec::with_capacity(rewrites.len());
let mut upper_items = Vec::with_capacity(rewrites.len());

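// For each collected rewrite: point the group item at the derived shrink column, cast the original
// column down to the shrink type below the aggregate (lower_items), and cast the shrink column back
// to the original type above it (upper_items).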
for rewrite in rewrites.iter() {
new_group_items[rewrite.position].index = rewrite.shrink_index;
new_group_items[rewrite.position].scalar = ScalarExpr::BoundColumnRef(BoundColumnRef {
span: None,
column: rewrite.shrink_binding.clone(),

Review comment (P1): Missing stats for shrink group key causes aggregate panic

When the rule rewrites a group key to use the new shrink_index (group_items[position].index = rewrite.shrink_index), no column statistics are created for that derived column (EvalScalar’s derive_stats returns the child stats unchanged). Aggregate::derive_agg_stats later indexes column_stats[&group.index] and unwraps, so any aggregate that triggers this shrink will panic during planning because the stats map lacks an entry for the new index. Add stats for the shrink column or avoid replacing the group index in stats-based code paths.

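To make the failure mode above concrete, here is a minimal, self-contained sketch using plain std types; the ColumnStat/IndexType stand-ins and the derive_group_ndv helper are hypothetical and are not Databend's actual API. It shows why a group index swapped to a freshly derived column panics when the stats map has no entry for it, and how cloning the original column's stats under the new index (one of the options suggested above) would avoid that.

use std::collections::HashMap;

// Hypothetical stand-ins for the optimizer types, for illustration only.
type IndexType = usize;

#[derive(Clone, Debug, PartialEq)]
struct ColumnStat {
    ndv: f64,
}

// Roughly the shape of the lookup described above: index the stats map by the
// group key's column index and unwrap the entry.
fn derive_group_ndv(column_stats: &HashMap<IndexType, ColumnStat>, group_index: IndexType) -> f64 {
    column_stats.get(&group_index).unwrap().ndv // panics if the index has no stats entry
}

fn main() {
    let mut column_stats: HashMap<IndexType, ColumnStat> = HashMap::new();
    column_stats.insert(1, ColumnStat { ndv: 10.0 }); // stats exist for the original column (index 1)

    let shrink_index: IndexType = 42; // derived "<col>_shrink" column: no stats entry yet

    // Fine for the original group index:
    assert_eq!(derive_group_ndv(&column_stats, 1), 10.0);

    // Without the two lines below, looking up `shrink_index` would unwrap a None and panic.
    // One way out, as suggested above, is to register stats for the derived column,
    // e.g. by cloning the original column's entry under the new index.
    let original = column_stats[&1].clone();
    column_stats.insert(shrink_index, original);
    assert_eq!(derive_group_ndv(&column_stats, shrink_index), 10.0);
}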
});

lower_items.push(ScalarItem {
index: rewrite.shrink_index,
scalar: ScalarExpr::CastExpr(CastExpr {
span: None,
is_try: false,
argument: Box::new(ScalarExpr::BoundColumnRef(BoundColumnRef {
span: None,
column: rewrite.original_binding.clone(),
})),
target_type: Box::new(rewrite.shrink_type.clone()),
}),
});

upper_items.push(ScalarItem {
index: rewrite.original_index,
scalar: ScalarExpr::CastExpr(CastExpr {
span: None,
is_try: false,
argument: Box::new(ScalarExpr::BoundColumnRef(BoundColumnRef {
span: None,
column: rewrite.shrink_binding.clone(),
})),
target_type: Box::new((*rewrite.original_binding.data_type).clone()),
}),
});
}

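// Rebuild the plan: an EvalScalar computing the narrowed keys under the aggregate, the aggregate
// over the rewritten group items, and an EvalScalar on top restoring the original indexes and types.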
let mut new_child = s_expr.child(0)?.clone();
if !lower_items.is_empty() {
new_child = SExpr::create_unary(
Arc::new(EvalScalar { items: lower_items }.into()),
Arc::new(new_child),
);
}

let mut new_agg = agg;
new_agg.group_items = new_group_items;
let mut new_expr = SExpr::create_unary(Arc::new(new_agg.into()), Arc::new(new_child));

if !upper_items.is_empty() {
new_expr = SExpr::create_unary(
Arc::new(EvalScalar { items: upper_items }.into()),
Arc::new(new_expr),
);
}

state.add_result(new_expr);
Ok(())
}

fn matchers(&self) -> &[Matcher] {
&self.matchers
}
}

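/// Bookkeeping for one rewritten group key: its position in group_items, the original
/// binding and index, and the derived shrink binding, type, and index.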
struct GroupByRewrite {
position: usize,
original_binding: ColumnBinding,
original_index: IndexType,
shrink_binding: ColumnBinding,
shrink_type: DataType,
shrink_index: IndexType,
}

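/// Returns a narrower numeric type for an integer (or nullable integer) column whose
/// original min/max fit in the smaller type; None when no shrink is possible.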
fn shrink_group_by_data_type(data_type: &DataType, stat: &ColumnStat) -> Option<DataType> {
match data_type {
DataType::Nullable(inner) => {
shrink_group_by_data_type(inner, stat).map(|dt| DataType::Nullable(Box::new(dt)))
}
DataType::Number(number_type) => {
shrink_number_type(*number_type, &stat.origin_min, &stat.origin_max)
.map(DataType::Number)
}
_ => None,
}
}

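/// Picks a narrower integer type whose range covers [min, max]; UInt8/Int8 and
/// non-integer numeric types cannot shrink and return None.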
fn shrink_number_type(
number_type: NumberDataType,
min: &Datum,
max: &Datum,
) -> Option<NumberDataType> {
use NumberDataType::*;
match number_type {
UInt8 | Int8 => None,
UInt16 => shrink_unsigned(min, max, &[UInt8]),
UInt32 => shrink_unsigned(min, max, &[UInt8, UInt16]),
UInt64 => shrink_unsigned(min, max, &[UInt8, UInt16, UInt32]),
Int16 => shrink_signed(min, max, &[Int8]),
Int32 => shrink_signed(min, max, &[Int8, Int16]),
Int64 => shrink_signed(min, max, &[Int8, Int16, Int32]),
_ => None,
}
}

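/// Returns the first (narrowest) candidate unsigned type that can hold max; yields None
/// if min/max do not convert to u128 or the range is inverted.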
fn shrink_unsigned(
min: &Datum,
max: &Datum,
candidates: &[NumberDataType],
) -> Option<NumberDataType> {
let (Some(min_v), Some(max_v)) = (datum_to_u128(min), datum_to_u128(max)) else {
return None;
};
if max_v < min_v {
return None;
}
for candidate in candidates {
match candidate {
NumberDataType::UInt8 if max_v <= u8::MAX as u128 => return Some(*candidate),
NumberDataType::UInt16 if max_v <= u16::MAX as u128 => return Some(*candidate),
NumberDataType::UInt32 if max_v <= u32::MAX as u128 => return Some(*candidate),
_ => continue,
}
}
None
}

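/// Returns the first candidate signed type whose [MIN, MAX] range contains both min and
/// max; yields None if the datums do not convert to i128 or the range is inverted.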
fn shrink_signed(
min: &Datum,
max: &Datum,
candidates: &[NumberDataType],
) -> Option<NumberDataType> {
let (Some(min_v), Some(max_v)) = (datum_to_i128(min), datum_to_i128(max)) else {
return None;
};
if max_v < min_v {
return None;
}
for candidate in candidates {
match candidate {
NumberDataType::Int8 if min_v >= i8::MIN as i128 && max_v <= i8::MAX as i128 => {
return Some(*candidate);
}
NumberDataType::Int16 if min_v >= i16::MIN as i128 && max_v <= i16::MAX as i128 => {
return Some(*candidate);
}
NumberDataType::Int32 if min_v >= i32::MIN as i128 && max_v <= i32::MAX as i128 => {
return Some(*candidate);
}
_ => continue,
}
}
None
}

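/// Converts a statistics Datum to u128 for unsigned range checks; negative or non-integer datums yield None.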
fn datum_to_u128(value: &Datum) -> Option<u128> {
match value {
Datum::UInt(v) => Some(*v as u128),
Datum::Int(v) if *v >= 0 => Some(*v as u128),
_ => None,
}
}

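/// Converts a statistics Datum to i128 for signed range checks; non-integer datums yield None.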
fn datum_to_i128(value: &Datum) -> Option<i128> {
match value {
Datum::Int(v) => Some(*v as i128),
Datum::UInt(v) => Some(*v as i128),
_ => None,
}
}

#[cfg(test)]
mod tests {
use databend_common_storage::Datum;

use super::*;
use crate::optimizer::ir::ColumnStat;
use crate::optimizer::ir::Ndv;

#[test]
fn test_shrink_unsigned() {
let stat = ColumnStat {
min: Datum::UInt(0),
max: Datum::UInt(100),
ndv: Ndv::Stat(10.0),
null_count: 0,
origin_min: Datum::UInt(0),
origin_max: Datum::UInt(100),
histogram: None,
};
let t = shrink_group_by_data_type(&DataType::Number(NumberDataType::UInt64), &stat);
assert_eq!(t, Some(DataType::Number(NumberDataType::UInt8)));
}

#[test]
fn test_shrink_signed() {
let stat = ColumnStat {
min: Datum::Int(-100),
max: Datum::Int(100),
ndv: Ndv::Stat(10.0),
null_count: 0,
origin_min: Datum::Int(-100),
origin_max: Datum::Int(100),
histogram: None,
};
let t = shrink_group_by_data_type(&DataType::Number(NumberDataType::Int64), &stat);
assert_eq!(t, Some(DataType::Number(NumberDataType::Int8)));
}

#[test]
fn test_shrink_nullable() {
let stat = ColumnStat {
min: Datum::UInt(0),
max: Datum::UInt(50000),
ndv: Ndv::Stat(10.0),
null_count: 0,
origin_min: Datum::UInt(0),
origin_max: Datum::UInt(50000),
histogram: None,
};
let t = shrink_group_by_data_type(
&DataType::Nullable(Box::new(DataType::Number(NumberDataType::UInt64))),
&stat,
);
assert_eq!(
t,
Some(DataType::Nullable(Box::new(DataType::Number(
NumberDataType::UInt16
))))
);
}

#[test]
fn test_no_shrink() {
let stat = ColumnStat {
min: Datum::UInt(0),
max: Datum::UInt(u32::MAX as u64 + 1),
ndv: Ndv::Stat(10.0),
null_count: 0,
origin_min: Datum::UInt(0),
origin_max: Datum::UInt(u32::MAX as u64 + 1),
histogram: None,
};
assert!(
shrink_group_by_data_type(&DataType::Number(NumberDataType::UInt64), &stat).is_none()
);
}
}