NVIDIA · jlowe · Jul 12, 2024 · Jul 11, 2024 · Jul 12, 2024
diff --git a/integration_tests/src/main/python/delta_lake_write_test.py b/integration_tests/src/main/python/delta_lake_write_test.py
@@ -1049,3 +1049,27 @@ def test_delta_write_column_name_mapping(spark_tmp_path, mapping):
         lambda spark, path: spark.read.format("delta").load(path),
         data_path,
         conf=confs)
+
+# Compaction can run a metadata query that falls back completely and uses a hash aggregate
+compaction_allow = "HashAggregateExec"
+if is_databricks_runtime():
+    # compaction can also fall back due to unsupported WriteIntoDeltaCommand,
+    # tracked by https://github.com/NVIDIA/spark-rapids/issues/11169
+    compaction_allow = ",".join((compaction_allow, delta_write_fallback_allow))
+@allow_non_gpu(compaction_allow, *delta_meta_allow)
+@delta_lake
+@ignore_order
+def test_delta_compaction(spark_tmp_path):
+    """Append to a Delta table, compact it, and check the writes via the CPU/GPU assert."""
+    from delta.tables import DeltaTable
+    table_path = spark_tmp_path + "/DELTA_DATA"
+    def append_then_compact(spark, path):
+        spark.range(1000).write.mode("append").format("delta").save(path)
+        DeltaTable.forPath(spark, path).optimize().executeCompaction()
+    def read_back(spark, path):
+        return spark.read.format("delta").load(path)
+    with_cpu_session(
+        lambda spark: _create_cpu_gpu_tables(spark, table_path, "id bigint"),
+        conf=_delta_confs)
+    assert_gpu_and_cpu_writes_are_equal_collect(
+        append_then_compact, read_back, table_path, conf=_delta_confs)
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
@@ -4646,8 +4646,14 @@ case class GpuOverrides() extends Rule[SparkPlan] with Logging {
         }
 
         // example filename: "file:/tmp/delta-table/_delta_log/00000000000000000000.json"
-        val found = f.relation.inputFiles.exists { name =>
-          checkDeltaFunc(name)
+        val found = StaticPartitionShims.getStaticPartitions(f.relation).map { parts =>
+          parts.exists { part =>
+            part.files.exists(partFile => checkDeltaFunc(partFile.filePath.toString))
+          }
+        }.getOrElse {
+          f.relation.inputFiles.exists { name =>
+            checkDeltaFunc(name)
+          }
         }
         if (found) {
           logDebug(s"Fallback for FileSourceScanExec delta log: $f")

diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuFileSourceScanExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuFileSourceScanExec.scala
@@ -22,7 +22,7 @@ import scala.collection.mutable.HashMap
 
 import com.nvidia.spark.rapids._
 import com.nvidia.spark.rapids.filecache.FileCacheLocalityManager
-import com.nvidia.spark.rapids.shims.{GpuDataSourceRDD, PartitionedFileUtilsShim, SparkShimImpl}
+import com.nvidia.spark.rapids.shims.{GpuDataSourceRDD, PartitionedFileUtilsShim, SparkShimImpl, StaticPartitionShims}
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.rdd.RDD
@@ -380,7 +380,7 @@ case class GpuFileSourceScanExec(
       createBucketedReadRDD(relation.bucketSpec.get, readFile, dynamicallySelectedPartitions,
         relation)
     } else {
-      createNonBucketedReadRDD(readFile, dynamicallySelectedPartitions, relation)
+      createNonBucketedReadRDD(readFile, relation)
     }
     sendDriverMetrics()
     readRDD
@@ -550,24 +550,28 @@ case class GpuFileSourceScanExec(
    *
    * @param readFile an optional function to read each (part of a) file. Used when
    *                 not using the small file optimization.
-   * @param selectedPartitions Hive-style partition that are part of the read.
    * @param fsRelation [[HadoopFsRelation]] associated with the read.
    */
   private def createNonBucketedReadRDD(
       readFile: Option[(PartitionedFile) => Iterator[InternalRow]],
-      selectedPartitions: Array[PartitionDirectory],
       fsRelation: HadoopFsRelation): RDD[InternalRow] = {
-    val openCostInBytes = fsRelation.sparkSession.sessionState.conf.filesOpenCostInBytes
-    val maxSplitBytes =
-      FilePartition.maxSplitBytes(fsRelation.sparkSession, selectedPartitions)
-    logInfo(s"Planning scan with bin packing, max size: $maxSplitBytes bytes, " +
-      s"open cost is considered as scanning $openCostInBytes bytes.")
+    // Use the relation's static partitions when it has them; otherwise plan splits here.
+    val partitions = StaticPartitionShims.getStaticPartitions(fsRelation).getOrElse {
+      // The caller used to pass dynamicallySelectedPartitions as the selectedPartitions
+      // argument. With that parameter removed, use it directly so split planning keeps the
+      // same partition set instead of whatever else is named selectedPartitions in scope.
+      // NOTE(review): assumes dynamicallySelectedPartitions is a member visible here - confirm.
+      val openCostInBytes = fsRelation.sparkSession.sessionState.conf.filesOpenCostInBytes
+      val maxSplitBytes =
+        FilePartition.maxSplitBytes(fsRelation.sparkSession, dynamicallySelectedPartitions)
+      logInfo(s"Planning scan with bin packing, max size: $maxSplitBytes bytes, " +
+        s"open cost is considered as scanning $openCostInBytes bytes.")
 
-    val splitFiles = FilePartitionShims.splitFiles(selectedPartitions, relation, maxSplitBytes)
+      val splitFiles = FilePartitionShims.splitFiles(
+        dynamicallySelectedPartitions, relation, maxSplitBytes)
 
-    val partitions =
       FilePartition.getFilePartitions(relation.sparkSession, splitFiles, maxSplitBytes)
-
+    }
     getFinalRDD(readFile, partitions)
   }
 

diff --git a/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/StaticPartitionShims.scala b/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/StaticPartitionShims.scala
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*** spark-rapids-shim-json-lines
+{"spark": "320"}
+{"spark": "321"}
+{"spark": "321cdh"}
+{"spark": "322"}
+{"spark": "323"}
+{"spark": "324"}
+{"spark": "330"}
+{"spark": "330cdh"}
+{"spark": "330db"}
+{"spark": "331"}
+{"spark": "332"}
+{"spark": "332cdh"}
+{"spark": "332db"}
+{"spark": "333"}
+{"spark": "334"}
+{"spark": "340"}
+{"spark": "341"}
+{"spark": "342"}
+{"spark": "343"}
+{"spark": "350"}
+{"spark": "351"}
+{"spark": "400"}
+spark-rapids-shim-json-lines ***/
+package com.nvidia.spark.rapids.shims
+
+import org.apache.spark.sql.execution.datasources.FilePartition
+import org.apache.spark.sql.execution.datasources.HadoopFsRelation
+
+object StaticPartitionShims {
+  /** Static partitions of a relation; always None for the Spark versions this shim covers. */
+  def getStaticPartitions(relation: HadoopFsRelation): Option[Seq[FilePartition]] = Option.empty
+}
diff --git a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/StaticPartitionShims.scala b/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/StaticPartitionShims.scala
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*** spark-rapids-shim-json-lines
+{"spark": "341db"}
+spark-rapids-shim-json-lines ***/
+package com.nvidia.spark.rapids.shims
+
+import com.databricks.sql.transaction.tahoe.files.TahoeFileIndexWithStaticPartitions
+
+import org.apache.spark.sql.execution.datasources.FilePartition
+import org.apache.spark.sql.execution.datasources.HadoopFsRelation
+
+object StaticPartitionShims {
+  /** Static partitions when the location is a TahoeFileIndexWithStaticPartitions, else None. */
+  def getStaticPartitions(relation: HadoopFsRelation): Option[Seq[FilePartition]] = {
+    relation.location match {
+      case t: TahoeFileIndexWithStaticPartitions => Some(t.getStaticPartitions)
+      case _ => None
+    }
+  }
+}