34 commits
eaacfa6
code to write daily irs
kambstreat May 30, 2025
40b6cb2
store incremental agg and compute final IRs
kambstreat Jun 3, 2025
a014b6e
Store hops to inc tables
kambstreat Jun 7, 2025
32d559e
add code changes to generate final output from IR for AVG
kambstreat Jun 14, 2025
37293df
change function structure and variable names
kambstreat Jun 19, 2025
6263706
remove unused functions
kambstreat Jun 19, 2025
cb4325b
change function defs
kambstreat Jun 19, 2025
796ef96
make changes
kambstreat Jun 19, 2025
f218b23
change function order
kambstreat Jun 19, 2025
b1d4ee9
add new field is_incremental to python api
kambstreat Jun 20, 2025
2ab7659
get argument for isIncremental in scala spark backend
kambstreat Jun 20, 2025
238c781
add unit test for incremental groupby
kambstreat Jun 20, 2025
8edfd27
reuse table ccreation
kambstreat Jul 18, 2025
e903683
Update GroupByTest
kambstreat Jul 18, 2025
0bdc4fc
Add GroupByTest for events
kambstreat Jul 18, 2025
7987931
changes for incrementalg
kambstreat Sep 3, 2025
2b26d45
resolve merge conflicts
kambstreat Sep 3, 2025
7b62a43
add last hole logic for incrementnal bacckfill
kambstreat Sep 5, 2025
aeeb5ec
fix syntax
kambstreat Sep 5, 2025
9180d23
fix bug : backfill only for missing holes
kambstreat Sep 6, 2025
ee81672
fix none error for inc Table
kambstreat Sep 7, 2025
29a3f28
add incremental table queryable range
kambstreat Sep 19, 2025
aa16010
add logging for tableUtils
kambstreat Sep 19, 2025
ff41cc9
add log
kambstreat Sep 19, 2025
aa25f9f
fill incremental holes
kambstreat Sep 22, 2025
3efe8cd
modify incremental aggregation parts
kambstreat Oct 2, 2025
a3bece6
remove logs for debugging
kambstreat Oct 7, 2025
897d18c
fix output schema from incremenntal aggregations. Added unit tests
kambstreat Oct 12, 2025
a52d7d0
resolve merge conflicts
kambstreat Oct 12, 2025
9c446ec
resolve merge conflict
kambstreat Oct 12, 2025
ca14309
add test case to test struct of Average
kambstreat Nov 2, 2025
dfb9226
add option for isIncremental for backward compatibility
kambstreat Nov 2, 2025
7ca4dfc
resolve merge conflicts
kambstreat Nov 2, 2025
ab994bc
fix count operation from incremental IRS
kambstreat Nov 4, 2025
@@ -70,6 +70,11 @@ class RowAggregator(val inputSchema: Seq[(String, DataType)], val aggregationPar
.toArray
.zip(columnAggregators.map(_.irType))

val incSchema = aggregationParts
Collaborator
I am thinking that we should use the full word "incremental" in the code, and keep "inc" as the suffix for the table so the table names don't get too long. What do you think?

Suggested change
val incSchema = aggregationParts
val incrementalSchema = aggregationParts

Author
Yes, good to use the full name.

.map(_.incOutputColumnName)
.toArray
.zip(columnAggregators.map(_.irType))

val outputSchema: Array[(String, DataType)] = aggregationParts
.map(_.outputColumnName)
.toArray
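A note on why a separate incremental schema is needed here: the incremental table stores intermediate representations (IRs) rather than finalized values, so its column types generally differ from the final output schema. The sketch below is a self-contained illustration in plain Scala (no Chronon classes; AvgIr and AvgIrExample are made-up names). It assumes the IR for AVERAGE is a (sum, count) pair that is only finalized into a double when the windowed output is produced.

// Minimal sketch: IR vs. finalized value for an AVERAGE aggregation.
// Assumption: the incremental (IR) form of AVERAGE is a (sum, count) pair,
// while the finalized output column is a single double.
final case class AvgIr(sum: Double, count: Long) {
  // Merging two IRs is associative, which is what makes daily partial
  // aggregates safe to store now and combine later.
  def merge(other: AvgIr): AvgIr = AvgIr(sum + other.sum, count + other.count)
  def finalized: Double = if (count == 0) Double.NaN else sum / count
}

object AvgIrExample extends App {
  val day1 = AvgIr(sum = 10.0, count = 2) // one day's IR row in the incremental table
  val day2 = AvgIr(sum = 30.0, count = 3) // another day's IR for the same key
  val merged = day1.merge(day2)           // what the final rollup does per window
  println(merged.finalized)               // 8.0 -> value of the finalized output column
}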
6 changes: 6 additions & 0 deletions api/src/main/scala/ai/chronon/api/Extensions.scala
@@ -98,6 +98,7 @@ object Extensions {
def cleanName: String = metaData.name.sanitize

def outputTable = s"${metaData.outputNamespace}.${metaData.cleanName}"
def incOutputTable = s"${metaData.outputNamespace}.${metaData.cleanName}_inc"
def outputLabelTable = s"${metaData.outputNamespace}.${metaData.cleanName}_labels"
def outputFinalView = s"${metaData.outputNamespace}.${metaData.cleanName}_labeled"
def outputLatestLabelView = s"${metaData.outputNamespace}.${metaData.cleanName}_labeled_latest"
@@ -176,8 +177,13 @@ object Extensions {

def outputColumnName =
s"${aggregationPart.inputColumn}_$opSuffix${aggregationPart.window.suffix}${bucketSuffix}"

def incOutputColumnName =
s"${aggregationPart.inputColumn}_$opSuffix${bucketSuffix}"
Collaborator
should we still keep the aggregationPart.window.suffix? Otherwise, how do we reconstruct the final output column?

Author
@pengyu-hou not sure I get it. I can't use the window.suffix here, since the intermediate incremental table is a daily aggregation.


}


implicit class AggregationOps(aggregation: Aggregation) {

// one agg part per bucket per window
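To make the naming discussion above concrete: the incremental column name drops the window suffix because the incremental table holds daily, window-agnostic IRs; windows are only applied when the daily IRs are rolled up into the final output. The snippet below is a hypothetical stand-alone illustration; the suffix strings ("average", "_7d") and helper names are assumptions, not the literal behavior of Extensions.scala.

// Hypothetical, stand-alone illustration of the two naming schemes discussed above.
// The concrete suffixes ("average", "_7d") are assumptions, not Chronon's exact strings.
object ColumnNameSketch extends App {
  def outputColumnName(inputColumn: String, opSuffix: String, windowSuffix: String, bucketSuffix: String = ""): String =
    s"${inputColumn}_$opSuffix$windowSuffix$bucketSuffix"

  // The incremental table is a daily aggregation, so the window suffix is dropped;
  // windows are re-applied later when daily IRs are rolled up into the final output.
  def incOutputColumnName(inputColumn: String, opSuffix: String, bucketSuffix: String = ""): String =
    s"${inputColumn}_$opSuffix$bucketSuffix"

  println(outputColumnName("price", "average", "_7d")) // price_average_7d
  println(incOutputColumnName("price", "average"))     // price_average
}

In this toy example, reconstructing price_average_7d from price_average requires knowing which window the rollup applied, which is presumably why the final output stage, rather than the incremental table, reintroduces the window suffix.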
136 changes: 114 additions & 22 deletions spark/src/main/scala/ai/chronon/spark/GroupBy.scala
@@ -18,6 +18,7 @@ package ai.chronon.spark

import ai.chronon.aggregator.base.TimeTuple
import ai.chronon.aggregator.row.RowAggregator
import ai.chronon.aggregator.windowing.HopsAggregator.HopIr
import ai.chronon.aggregator.windowing._
import ai.chronon.api
import ai.chronon.api.DataModel.{Entities, Events}
@@ -41,7 +42,9 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
val inputDf: DataFrame,
val mutationDfFn: () => DataFrame = null,
skewFilter: Option[String] = None,
finalize: Boolean = true)
finalize: Boolean = true,
incAgg: Boolean = false
)
extends Serializable {
@transient lazy val logger = LoggerFactory.getLogger(getClass)

@@ -88,7 +91,11 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
lazy val aggPartWithSchema = aggregationParts.zip(columnAggregators.map(_.outputType))

lazy val postAggSchema: StructType = {
val valueChrononSchema = if (finalize) windowAggregator.outputSchema else windowAggregator.irSchema
val valueChrononSchema = if (finalize) {
windowAggregator.outputSchema
} else {
windowAggregator.irSchema
}
SparkConversions.fromChrononSchema(valueChrononSchema)
}

@@ -141,12 +148,13 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
}

def snapshotEventsBase(partitionRange: PartitionRange,
resolution: Resolution = DailyResolution): RDD[(Array[Any], Array[Any])] = {
resolution: Resolution = DailyResolution,
incAgg: Boolean = true): RDD[(Array[Any], Array[Any])] = {
val endTimes: Array[Long] = partitionRange.toTimePoints
// add 1 day to the end times to include data [ds 00:00:00.000, ds + 1 00:00:00.000)
val shiftedEndTimes = endTimes.map(_ + tableUtils.partitionSpec.spanMillis)
val sawtoothAggregator = new SawtoothAggregator(aggregations, selectedSchema, resolution)
val hops = hopsAggregate(endTimes.min, resolution)
val hops = hopsAggregate(endTimes.min, resolution, incAgg)

hops
.flatMap {
@@ -356,12 +364,43 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
toDf(outputRdd, Seq(Constants.TimeColumn -> LongType, tableUtils.partitionColumn -> StringType))
}

//def dfToOutputArrayType(df: DataFrame): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
// val keyBuilder: Row => KeyWithHash =
// FastHashing.generateKeyBuilder(keyColumns.toArray, df.schema)

// df.rdd
// .keyBy(keyBuilder)
// .mapValues(SparkConversions.toChrononRow(_, tsIndex))
// .mapValues(windowAggregator.toTimeSortedArray)
//}

def flattenOutputArrayType(hopsArrays: RDD[(KeyWithHash, HopsAggregator.OutputArrayType)]): RDD[(Array[Any], Array[Any])] = {
hopsArrays.flatMap { case (keyWithHash: KeyWithHash, hopsArray: HopsAggregator.OutputArrayType) =>
val hopsArrayHead: Array[HopIr] = hopsArray.headOption.get
hopsArrayHead.map { array: HopIr =>
// the last element is a timestamp, we need to drop it
// and add it to the key
val timestamp = array.last.asInstanceOf[Long]
val withoutTimestamp = array.dropRight(1)
((keyWithHash.data :+ tableUtils.partitionSpec.at(timestamp)), withoutTimestamp)
}
}
}

def convertHopsToDf(range: PartitionRange,
schema: Array[(String, ai.chronon.api.DataType)]
): DataFrame = {
val hops = hopsAggregate(range.toTimePoints.min, DailyResolution)
val hopsDf = flattenOutputArrayType(hops)
toDf(hopsDf, Seq((tableUtils.partitionColumn, StringType)), Some(SparkConversions.fromChrononSchema(schema)))
}

// convert raw data into IRs, collected by hopSizes
// TODO cache this into a table: interface below
// Class HopsCacher(keySchema, irSchema, resolution) extends RddCacher[(KeyWithHash, HopsOutput)]
// buildTableRow((keyWithHash, hopsOutput)) -> GenericRowWithSchema
// buildRddRow(GenericRowWithSchema) -> (keyWithHash, hopsOutput)
def hopsAggregate(minQueryTs: Long, resolution: Resolution): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
def hopsAggregate(minQueryTs: Long, resolution: Resolution, incAgg: Boolean = false): RDD[(KeyWithHash, HopsAggregator.OutputArrayType)] = {
val hopsAggregator =
new HopsAggregator(minQueryTs, aggregations, selectedSchema, resolution)
val keyBuilder: Row => KeyWithHash =
Expand All @@ -378,9 +417,9 @@ class GroupBy(val aggregations: Seq[api.Aggregation],
}

protected[spark] def toDf(aggregateRdd: RDD[(Array[Any], Array[Any])],
additionalFields: Seq[(String, DataType)]): DataFrame = {
additionalFields: Seq[(String, DataType)], schema: Option[StructType] = None): DataFrame = {
val finalKeySchema = StructType(keySchema ++ additionalFields.map { case (name, typ) => StructField(name, typ) })
KvRdd(aggregateRdd, finalKeySchema, postAggSchema).toFlatDf
KvRdd(aggregateRdd, finalKeySchema, schema.getOrElse(postAggSchema)).toFlatDf
}

private def normalizeOrFinalize(ir: Array[Any]): Array[Any] =
@@ -461,18 +500,19 @@ object GroupBy {
bloomMapOpt: Option[util.Map[String, BloomFilter]] = None,
skewFilter: Option[String] = None,
finalize: Boolean = true,
showDf: Boolean = false): GroupBy = {
showDf: Boolean = false,
incrementalAgg: Boolean = false): GroupBy = {
logger.info(s"\n----[Processing GroupBy: ${groupByConfOld.metaData.name}]----")
val groupByConf = replaceJoinSource(groupByConfOld, queryRange, tableUtils, computeDependency, showDf)
val inputDf = groupByConf.sources.toScala
.map { source =>
renderDataSourceQuery(groupByConf,
source,
groupByConf.getKeyColumns.toScala,
queryRange,
tableUtils,
groupByConf.maxWindow,
groupByConf.inferredAccuracy)
source,
groupByConf.getKeyColumns.toScala,
queryRange,
tableUtils,
groupByConf.maxWindow,
groupByConf.inferredAccuracy)

}
.map {
@@ -543,15 +583,20 @@
logger.info(s"printing mutation data for groupBy: ${groupByConf.metaData.name}")
df.prettyPrint()
}

df
}

val finalizeValue = if (incrementalAgg) {
!incrementalAgg
} else {
finalize
}
new GroupBy(Option(groupByConf.getAggregations).map(_.toScala).orNull,
keyColumns,
nullFiltered,
mutationDfFn,
finalize = finalize)
keyColumns,
nullFiltered,
mutationDfFn,
finalize = finalizeValue,
incAgg = incrementalAgg,
)
}

def getIntersectedRange(source: api.Source,
@@ -670,12 +715,54 @@ object GroupBy {
query
}

def saveAndGetIncDf(
groupByConf: api.GroupBy,
range: PartitionRange,
tableUtils: TableUtils,
): GroupBy = {
val incOutputTable = groupByConf.metaData.incOutputTable
val tableProps = Option(groupByConf.metaData.tableProperties)
.map(_.toScala)
.orNull
//range should be modified to incremental range
val incGroupByBackfill = from(groupByConf, range, tableUtils, computeDependency = true, incrementalAgg = true)
val selectedSchema = incGroupByBackfill.selectedSchema
//TODO is there any other way to get incSchema?
val incSchema = new RowAggregator(selectedSchema, incGroupByBackfill.aggregations.flatMap(_.unWindowed)).incSchema
val hopsDf = incGroupByBackfill.convertHopsToDf(range, incSchema)
hopsDf.save(incOutputTable, tableProps)

val maxWindow = groupByConf.maxWindow.get
val sourceQueryableRange = PartitionRange(
tableUtils.partitionSpec.minus(range.start, maxWindow),
range.end
)(tableUtils)

val incTableFirstPartition: Option[String] = tableUtils.firstAvailablePartition(incOutputTable)
val incTableLastPartition: Option[String] = tableUtils.lastAvailablePartition(incOutputTable)

val incTableRange = PartitionRange(
incTableFirstPartition.get,
incTableLastPartition.get
)(tableUtils)

val incDfQuery = incTableRange.intersect(sourceQueryableRange).genScanQuery(null, incOutputTable)
val incDf: DataFrame = tableUtils.sql(incDfQuery)

new GroupBy(
incGroupByBackfill.aggregations,
incGroupByBackfill.keyColumns,
incDf
)
}

def computeBackfill(groupByConf: api.GroupBy,
endPartition: String,
tableUtils: TableUtils,
stepDays: Option[Int] = None,
overrideStartPartition: Option[String] = None,
skipFirstHole: Boolean = true): Unit = {
skipFirstHole: Boolean = true,
incrementalAgg: Boolean = true): Unit = {
assert(
groupByConf.backfillStartDate != null,
s"GroupBy:${groupByConf.metaData.name} has null backfillStartDate. This needs to be set for offline backfilling.")
@@ -714,7 +801,12 @@
stepRanges.zipWithIndex.foreach {
case (range, index) =>
logger.info(s"Computing group by for range: $range [${index + 1}/${stepRanges.size}]")
val groupByBackfill = from(groupByConf, range, tableUtils, computeDependency = true)
val groupByBackfill = if (incrementalAgg) {
saveAndGetIncDf(groupByConf, range, tableUtils)
//from(groupByConf, range, tableUtils, computeDependency = true)
} else {
from(groupByConf, range, tableUtils, computeDependency = true)
}
val outputDf = groupByConf.dataModel match {
// group by backfills have to be snapshot only
case Entities => groupByBackfill.snapshotEntities
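Rough shape of the flow that saveAndGetIncDf and computeBackfill wire together, as a hedged, self-contained sketch: daily IRs are written to the incremental table, the partitions covering [end - maxWindow, end] are scanned back, and the per-key IRs are merged and finalized into the windowed output. All names below (AvgIr, IncrementalBackfillSketch, windowedOutput) are illustrative only and do not exist in the codebase.

import java.time.LocalDate

// Illustrative-only model of the incremental backfill flow in this PR:
// 1) daily IRs land in the incremental ("_inc") table,
// 2) the partitions covering the lookback window are scanned,
// 3) IRs are merged per key and finalized into the output value.
final case class AvgIr(sum: Double, count: Long) {
  def merge(other: AvgIr): AvgIr = AvgIr(sum + other.sum, count + other.count)
  def finalized: Double = if (count == 0) Double.NaN else sum / count
}

object IncrementalBackfillSketch extends App {
  // Stand-in for the persisted incremental table: (key, partition date) -> daily IR
  val incTable: Map[(String, LocalDate), AvgIr] = Map(
    ("user_1", LocalDate.parse("2025-01-01")) -> AvgIr(10.0, 2),
    ("user_1", LocalDate.parse("2025-01-03")) -> AvgIr(30.0, 3),
    ("user_2", LocalDate.parse("2025-01-02")) -> AvgIr(5.0, 1)
  )

  def windowedOutput(end: LocalDate, windowDays: Int): Map[String, Double] = {
    val start = end.minusDays(windowDays - 1L)
    incTable.toSeq
      .collect { case ((key, ds), ir) if !ds.isBefore(start) && !ds.isAfter(end) => key -> ir }
      .groupMapReduce(_._1)(_._2)((a, b) => a.merge(b)) // merge daily IRs per key
      .map { case (key, ir) => key -> ir.finalized }
  }

  println(windowedOutput(LocalDate.parse("2025-01-03"), windowDays = 7))
  // e.g. Map(user_1 -> 8.0, user_2 -> 5.0)
}

The property this relies on is that IR merge is associative and commutative, so daily partial aggregates can be combined in any order when the final windowed values are assembled.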