1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -18,6 +18,7 @@
- Fixed issue Silver Data Quality not working [PR](https://github.com/databrickslabs/dlt-meta/issues/156)
- Fixed issue Removed DPM flag check inside dataflowpipeline [PR](https://github.com/databrickslabs/dlt-meta/issues/177)
- Fixed issue Updated dlt-meta demos into Delta Live Tables Notebook github [PR](https://github.com/databrickslabs/dlt-meta/issues/158)
- Fixed issue Adding multiple col support for auto_cdc api [PR](https://github.com/databrickslabs/dlt-meta/pull/224)


## [v.0.0.9]
2 changes: 1 addition & 1 deletion docs/content/faq/execution.md
@@ -114,7 +114,7 @@ When you launch Lakeflow Declarative Pipeline it will read silver onboarding and
"keys":[
"customer_id"
],
"sequence_by":"dmsTimestamp",
"sequence_by":"dmsTimestamp,enqueueTimestamp,sequenceId",
"scd_type":"2",
"apply_as_deletes":"Op = 'D'",
"except_column_list":[
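The multi-column `sequence_by` shown above works because Spark orders struct values field by field, in the order the fields are listed: the first column drives the ordering and later columns only break ties. Below is a minimal sketch of that behaviour, assuming a local Spark session and illustrative data; none of it is dlt-meta API.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, struct

spark = SparkSession.builder.getOrCreate()

# Two change records for the same key share a dmsTimestamp;
# sequenceId breaks the tie because struct comparison is field-by-field.
df = spark.createDataFrame(
    [(100, 1, "old value"), (100, 2, "new value")],
    ["dmsTimestamp", "sequenceId", "value"],
)

latest = df.orderBy(struct(col("dmsTimestamp"), col("sequenceId")).desc()).first()
print(latest["value"])  # "new value": same dmsTimestamp, higher sequenceId wins
```

This is what lets `enqueueTimestamp` and `sequenceId` in the onboarding config above disambiguate change events that arrive with the same `dmsTimestamp`.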
1 change: 1 addition & 0 deletions docs/content/releases/_index.md
@@ -43,6 +43,7 @@ draft: false
- Fixed issue Changed dbfs.create to mkdirs for CLI: [PR](https://github.com/databrickslabs/dlt-meta/pull/53)
- Fixed issue DLT-META CLI should use pypi lib instead of whl : [PR](https://github.com/databrickslabs/dlt-meta/pull/79)
- Fixed issue Onboarding with multiple partition columns errors out: [PR](https://github.com/databrickslabs/dlt-meta/pull/134)
- Fixed issue Adding multiple col support for auto_cdc api [PR](https://github.com/databrickslabs/dlt-meta/pull/224)

# v0.0.8
## Enhancements
24 changes: 20 additions & 4 deletions src/dataflow_pipeline.py
@@ -5,7 +5,7 @@
import ast
import dlt
from pyspark.sql import DataFrame
from pyspark.sql.functions import expr
from pyspark.sql.functions import expr, struct
from pyspark.sql.types import StructType, StructField
from src.dataflow_spec import BronzeDataflowSpec, SilverDataflowSpec, DataflowSpecUtils
from src.pipeline_writers import AppendFlowWriter, DLTSinkWriter
@@ -630,11 +630,18 @@ def cdc_apply_changes(self):
target_table = (
f"{target_cl_name}{target_db_name}.{target_table_name}"
)

# Handle comma-separated sequence columns using struct
sequence_by = cdc_apply_changes.sequence_by
if ',' in sequence_by:
sequence_cols = [col.strip() for col in sequence_by.split(',')]
sequence_by = struct(*sequence_cols) # Use struct() from pyspark.sql.functions

dlt.create_auto_cdc_flow(
target=target_table,
source=self.view_name,
keys=cdc_apply_changes.keys,
sequence_by=cdc_apply_changes.sequence_by,
sequence_by=sequence_by,
where=cdc_apply_changes.where,
ignore_null_updates=cdc_apply_changes.ignore_null_updates,
apply_as_deletes=apply_as_deletes,
@@ -673,8 +680,17 @@ def modify_schema_for_cdc_changes(self, cdc_apply_changes):
for field in struct_schema.fields:
if field.name not in cdc_apply_changes.except_column_list:
modified_schema.add(field)
if field.name == cdc_apply_changes.sequence_by:
sequenced_by_data_type = field.dataType
# For SCD Type 2, get data type of first sequence column
sequence_by = cdc_apply_changes.sequence_by.strip()
if ',' not in sequence_by:
# Single column sequence
if field.name == sequence_by:
sequenced_by_data_type = field.dataType
else:
# Multiple column sequence - use first column's type
first_sequence_col = sequence_by.split(',')[0].strip()
if field.name == first_sequence_col:
sequenced_by_data_type = field.dataType
struct_schema = modified_schema
else:
raise Exception(f"Schema is None for {self.dataflowSpec} for cdc_apply_changes! ")
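Taken together, the two hunks above reduce to the behaviour sketched below. This is a standalone illustration that mirrors the diff rather than replacing it; the helper names `resolve_sequence_by` and `first_sequence_column` are illustrative and not part of dlt-meta.

```python
from pyspark.sql.functions import struct


def resolve_sequence_by(sequence_by: str):
    """Comma-separated sequence columns become a single struct() expression;
    a single column name is passed through to the CDC flow unchanged."""
    if ',' in sequence_by:
        return struct(*[c.strip() for c in sequence_by.split(',')])
    return sequence_by


def first_sequence_column(sequence_by: str) -> str:
    """When several sequence columns are supplied, the first one listed is the
    column whose data type is recorded as sequenced_by_data_type."""
    return sequence_by.strip().split(',')[0].strip()


# resolve_sequence_by("dmsTimestamp")               -> "dmsTimestamp"
# resolve_sequence_by("dmsTimestamp, sequenceId")   -> Column<struct(dmsTimestamp, sequenceId)>
# first_sequence_column("dmsTimestamp, sequenceId") -> "dmsTimestamp"
```

The first-column rule only matters for SCD Type 2, where `modify_schema_for_cdc_changes` needs a single `sequenced_by_data_type` when building the target schema.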
16 changes: 16 additions & 0 deletions tests/test_dataflow_pipeline.py
@@ -1363,6 +1363,22 @@ def test_write_bronze_cdc_apply_changes(self, mock_cdc_apply_changes):
pipeline.write_bronze()
assert mock_cdc_apply_changes.called

@patch.object(DataflowPipeline, 'cdc_apply_changes', return_value=None)
def test_write_bronze_cdc_apply_changes_multiple_sequence(self, mock_cdc_apply_changes):
"""Test write_bronze with CDC apply changes using multiple sequence columns."""
bronze_dataflow_spec = BronzeDataflowSpec(**self.bronze_dataflow_spec_map)
bronze_dataflow_spec.cdcApplyChanges = json.dumps({
"keys": ["id"],
"sequence_by": "event_timestamp, enqueue_timestamp, sequence_id",
"scd_type": "1",
"apply_as_deletes": "operation = 'DELETE'",
"except_column_list": ["operation", "event_timestamp", "enqueue_timestamp", "sequence_id", "_rescued_data"]
})
view_name = f"{bronze_dataflow_spec.targetDetails['table']}_inputview"
pipeline = DataflowPipeline(self.spark, bronze_dataflow_spec, view_name, None)
pipeline.write_bronze()
assert mock_cdc_apply_changes.called

@patch('pyspark.sql.SparkSession.readStream')
def test_get_silver_schema_uc_enabled(self, mock_read_stream):
"""Test get_silver_schema with Unity Catalog enabled."""