
Commit c1cc40b

res-life and Chong Gao authored
Use new API to do Iceberg partition. (#13688)
Fixes #13679. Use the new API to do Iceberg partitioning.

Signed-off-by: Chong Gao <[email protected]>

Depends on
* rapidsai/cudf#20391

### Checklists
- [x] This PR has added documentation for new or modified features or behaviors.
- [x] This PR has added new tests or modified existing tests to cover new code paths.
- [x] Performance testing has been performed and its results are added in the PR description. Or, an issue has been filed with a link in the PR description. A perf test is not needed here because the new API is clearly faster: it reduces multiple kernels into a single kernel.

---------

Signed-off-by: Chong Gao <[email protected]>
Co-authored-by: Chong Gao <[email protected]>
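To make the kernel claim concrete, here is a minimal sketch (not part of the commit) of the call pattern involved. The old steps are listed in the comments, the new path is the single grouped call used in the diff below, and the names `keysAndInput`, `keyColIndices`, and `inputColumnIndices` are illustrative:

```scala
import ai.rapids.cudf.Table

object GroupedSplitSketch {
  // `keysAndInput` must be laid out as [key columns, input columns].
  // Old path (several GPU operations): orderBy (sort keys plus a row index),
  // groupBy(...).aggregate() (unique keys), upperBound (split offsets),
  // gather (reorder rows), contiguousSplit (slice per partition).
  // New path: one grouped call that also produces the unique keys.
  def splitByPartitionKeys(keysAndInput: Table,
      keyColIndices: Array[Int],
      inputColumnIndices: Array[Int]) = {
    // The caller owns the returned result and must close it.
    keysAndInput.groupBy(keyColIndices: _*)
      .contiguousSplitGroupsAndGenUniqKeys(inputColumnIndices)
  }
}
```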
1 parent: 5ad40ec

1 file changed: +55 -81 lines


iceberg/src/main/scala/com/nvidia/spark/rapids/iceberg/GpuIcebergPartitioner.scala

Lines changed: 55 additions & 81 deletions
```diff
@@ -20,7 +20,7 @@ import java.lang.Math.toIntExact
 
 import scala.collection.JavaConverters._
 
-import ai.rapids.cudf.{ColumnVector => CudfColumnVector, OrderByArg, Scalar, Table}
+import ai.rapids.cudf.{ColumnVector => CudfColumnVector, Table}
 import com.nvidia.spark.rapids.{GpuBoundReference, GpuColumnVector, GpuExpression, GpuLiteral, RapidsHostColumnVector, SpillableColumnarBatch, SpillPriorities}
 import com.nvidia.spark.rapids.Arm.{closeOnExcept, withResource}
 import com.nvidia.spark.rapids.RapidsPluginImplicits.AutoCloseableProducingSeq
@@ -54,10 +54,37 @@ class GpuIcebergPartitioner(val spec: PartitionSpec,
   private val partitionExprs: Seq[GpuExpression] = spec.fields().asScala.map(getPartitionExpr).toSeq
 
   private val keyColNum: Int = spec.fields().size()
+  private val inputColNum: Int = dataSparkType.fields.length
+
+  // key column indices in the table: [key columns, input columns]
   private val keyColIndices: Array[Int] = (0 until keyColNum).toArray
-  private val keySortOrders: Array[OrderByArg] = (0 until keyColNum)
-    .map(OrderByArg.asc(_, true))
-    .toArray
+  // input column indices in the table: [key columns, input columns]
+  private val inputColumnIndices: Array[Int] = (keyColNum until (keyColNum + inputColNum)).toArray
+
+  /**
+   * Make a new table: [key columns, input columns]
+   */
+  private def makeKeysAndInputTable(spillableInput: SpillableColumnarBatch): Table = {
+    withResource(spillableInput.getColumnarBatch()) { inputBatch =>
+      // compute keys columns
+      val keyCols = partitionExprs.safeMap(_.columnarEval(inputBatch))
+
+      // combine keys columns and input columns into a new table
+      withResource(keyCols) { _ =>
+        withResource(GpuColumnVector.from(inputBatch)) { inputTable =>
+          val numCols = keyCols.size + inputTable.getNumberOfColumns
+          val cols = new Array[CudfColumnVector](numCols)
+          for (i <- keyCols.indices) {
+            cols(i) = keyCols(i).getBase
+          }
+          for (i <- 0 until inputTable.getNumberOfColumns) {
+            cols(i + keyCols.size) = inputTable.getColumn(i)
+          }
+          new Table(cols:_*)
+        }
+      }
+    }
+  }
 
   /**
    * Partition the `input` columnar batch using iceberg's partition spec.
```
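As an illustrative aside (values not from the commit), the index bookkeeping introduced above works out as follows for a two-field partition spec over a three-column input batch; the snippet can be pasted into a Scala REPL:

```scala
// Illustrative only: 2 partition fields and 3 input columns.
val keyColNum = 2                // spec.fields().size()
val inputColNum = 3              // dataSparkType.fields.length
// Indices into the combined [key columns, input columns] table.
val keyColIndices = (0 until keyColNum).toArray                              // Array(0, 1)
val inputColumnIndices = (keyColNum until (keyColNum + inputColNum)).toArray // Array(2, 3, 4)
```

The diff continues with the rewritten partition method: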
```diff
@@ -70,94 +97,41 @@ class GpuIcebergPartitioner(val spec: PartitionSpec,
       return Seq.empty
     }
 
-    val numRows = input.numRows()
-
     val spillableInput = closeOnExcept(input) { _ =>
       SpillableColumnarBatch(input, ACTIVE_ON_DECK_PRIORITY)
     }
 
-    val (partitionKeys, partitions) = withRetryNoSplit(spillableInput) { scb =>
-      val parts = withResource(scb.getColumnarBatch()) { inputBatch =>
-        partitionExprs.safeMap(_.columnarEval(inputBatch))
-      }
-      val keysTable = withResource(parts) { _ =>
-        val arr = new Array[CudfColumnVector](partitionExprs.size)
-        for (i <- partitionExprs.indices) {
-          arr(i) = parts(i).getBase
-        }
-        new Table(arr:_*)
-      }
-
-      val sortedKeyTableWithRowIdx = withResource(keysTable) { _ =>
-        withResource(Scalar.fromInt(0)) { zero =>
-          withResource(CudfColumnVector.sequence(zero, numRows)) { rowIdxCol =>
-            val totalColCount = keysTable.getNumberOfColumns + 1
-            val allCols = new Array[CudfColumnVector](totalColCount)
-
-            for (i <- 0 until keysTable.getNumberOfColumns) {
-              allCols(i) = keysTable.getColumn(i)
-            }
-            allCols(keysTable.getNumberOfColumns) = rowIdxCol
-
-            withResource(new Table(allCols: _*)) { allColsTable =>
-              allColsTable.orderBy(keySortOrders: _*)
-            }
-          }
-        }
-      }
-
-      val (sortedPartitionKeys, splitIds, rowIdxCol) = withResource(sortedKeyTableWithRowIdx) { _ =>
-        val uniqueKeysTable = sortedKeyTableWithRowIdx.groupBy(keyColIndices: _*)
-          .aggregate()
-
-        val sortedUniqueKeysTable = withResource(uniqueKeysTable) { _ =>
-          uniqueKeysTable.orderBy(keySortOrders: _*)
-        }
-
-        val (sortedPartitionKeys, splitIds) = withResource(sortedUniqueKeysTable) { _ =>
-          val partitionKeys = toPartitionKeys(spec.partitionType(),
-            partitionSparkType,
-            sortedUniqueKeysTable)
-
-          val splitIdsCv = sortedKeyTableWithRowIdx.upperBound(
-            sortedUniqueKeysTable,
-            keySortOrders: _*)
-
-          val splitIds = withResource(splitIdsCv) { _ =>
-            GpuColumnVector.toIntArray(splitIdsCv)
-          }
-
-          (partitionKeys, splitIds)
-        }
+    withRetryNoSplit(spillableInput) { scb =>
+      // make table: [key columns, input columns]
+      val keysAndInputTable = makeKeysAndInputTable(scb)
 
-        val rowIdxCol = sortedKeyTableWithRowIdx.getColumn(keyColNum).incRefCount()
-        (sortedPartitionKeys, splitIds, rowIdxCol)
+      // split the input columns by the key columns,
+      // note: the result does not contain the key columns
+      val splitRet = withResource(keysAndInputTable) { _ =>
+        keysAndInputTable.groupBy(keyColIndices: _*)
+          .contiguousSplitGroupsAndGenUniqKeys(inputColumnIndices)
       }
 
-      withResource(rowIdxCol) { _ =>
-        val inputTable = withResource(scb.getColumnarBatch()) { inputBatch =>
-          GpuColumnVector.from(inputBatch)
-        }
+      // generate results
+      withResource(splitRet) { _ =>
+        // generate the partition keys on the host side
+        val partitionKeys = toPartitionKeys(spec.partitionType(),
+          partitionSparkType,
+          splitRet.getUniqKeyTable)
 
-        val sortedDataTable = withResource(inputTable) { _ =>
-          inputTable.gather(rowIdxCol)
-        }
+        // release unique table to save GPU memory
+        splitRet.closeUniqKeyTable()
 
-        val partitions = withResource(sortedDataTable) { _ =>
-          sortedDataTable.contiguousSplit(splitIds: _*)
-        }
+        // get the partitions
+        val partitions = splitRet.getGroups
 
-        (sortedPartitionKeys, partitions)
+        // combine the partition keys and partitioned tables
+        partitionKeys.zip(partitions).map { case (partKey, partition) =>
+          ColumnarBatchWithPartition(SpillableColumnarBatch(partition, sparkType, SpillPriorities
+            .ACTIVE_BATCHING_PRIORITY), partKey)
+        }.toSeq
       }
     }
-
-    withResource(partitions) { _ =>
-      partitionKeys.zip(partitions).map { case (partKey, partition) =>
-        ColumnarBatchWithPartition(SpillableColumnarBatch(partition, sparkType, SpillPriorities
-          .ACTIVE_BATCHING_PRIORITY), partKey)
-      }.toSeq
-    }
-
   }
 
   private def getPartitionExpr(field: PartitionField)
@@ -208,4 +182,4 @@ object GpuIcebergPartitioner {
       }).toArray
     }
   }
-}
+}
```
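For completeness, a small hedged usage sketch of the grouped-split result as it is consumed above. It assumes a GPU plus a cuDF build that contains the new API from rapidsai/cudf#20391, the result-object method names are taken from the diff, and the data values are made up:

```scala
import ai.rapids.cudf.{ColumnVector, Table}

object GroupedSplitDemo {
  def main(args: Array[String]): Unit = {
    // One key column (e.g. the result of an Iceberg partition transform) and one data column.
    val keys = ColumnVector.fromInts(1, 2, 1, 2, 2)
    val data = ColumnVector.fromInts(10, 20, 30, 40, 50)
    val keysAndInput = new Table(keys, data) // the Table takes its own references
    keys.close()
    data.close()
    try {
      // Group by column 0 (the key) and contiguously split column 1 (the data);
      // the result also carries one row of unique keys per group.
      val splitRet = keysAndInput.groupBy(0).contiguousSplitGroupsAndGenUniqKeys(Array(1))
      try {
        println(s"distinct keys: ${splitRet.getUniqKeyTable.getRowCount}")
        splitRet.getGroups.foreach { group =>
          println(s"rows in group: ${group.getTable.getRowCount}")
        }
      } finally {
        // Release the result; the commit additionally calls closeUniqKeyTable()
        // early to free GPU memory sooner.
        splitRet.close()
      }
    } finally {
      keysAndInput.close()
    }
  }
}
```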
