@@ -33,11 +33,14 @@
 from dlt.common.metrics import LoadJobMetrics
 from dlt.common.normalizers.naming import NamingConvention
 
-from dlt.common.schema import Schema, TSchemaTables
+from dlt.common.schema import Schema, TSchemaTables, TSchemaDrop
 from dlt.common.schema.typing import (
+    C_DLT_ID,
     C_DLT_LOAD_ID,
     TLoaderReplaceStrategy,
     TTableFormat,
+    TTableSchemaColumns,
+    TPartialTableSchema,
 )
 from dlt.common.destination.capabilities import DestinationCapabilitiesContext
 from dlt.common.destination.exceptions import (
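Note: `TSchemaDrop`, imported above, is the type this PR threads through the new sync logic. A minimal sketch of how it reads from its usage further down — the alias body here is an assumption inferred from the diff, not copied from `dlt.common.schema`:

# Assumed shape of TSchemaDrop, inferred from how update_dlt_schema uses it;
# the real alias lives in dlt.common.schema and may differ.
from typing import Dict
from dlt.common.schema.typing import TPartialTableSchema

TSchemaDrop = Dict[str, TPartialTableSchema]  # table name -> (partial) table schema to drop

# Example: table "orders" lost the column "status"; a table that vanished
# entirely would map to its full table schema instead.
example_drop: TSchemaDrop = {
    "orders": {"name": "orders", "columns": {"status": {"name": "status", "data_type": "text"}}},
}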
@@ -539,36 +542,120 @@ def update_stored_schema(
         )
         return expected_update
 
-    def update_stored_schema_destructively(
+    def update_dlt_schema(
         self,
-    ) -> None:
-        """
-        Compare the schema we think we should have (`self.schema`)
-        with what actually exists in the destination, and drop any
-        columns that disappeared.
-        """
-        for table in self.schema.data_tables():
-            table_name = table["name"]
-
-            actual_columns = self._get_actual_columns(table_name)
-            schema_columns = self.schema.get_table_columns(table_name)
-            dropped_columns = set(schema_columns.keys()) - set(actual_columns)
-            if dropped_columns:
-                for dropped_col in dropped_columns:
-                    if schema_columns[dropped_col].get("increment"):
-                        logger.warning(
-                            "An incremental field is being removed from schema."
-                            "You should unset the"
-                            " incremental with `incremental=dlt.sources.incremental.EMPTY`"
-                        )
-                self.schema.drop_columns(table_name, list(dropped_columns))
-
-    def _get_actual_columns(self, table_name: str) -> List[str]:  # noqa: B027, optional override
-        """
-        Return a list of column names that currently exist in the
-        destination for `table_name`.
+        table_names: Optional[Iterable[str]] = None,
+        dry_run: bool = False,
+    ) -> Optional[TSchemaDrop]:
+        """Updates the dlt schema to match the actual state of the destination storage.
+
+        Compares the schema we think we should have (`self.schema`) with what actually exists
+        in the destination, and drops any tables and/or columns that disappeared.
+
+        Args:
+            table_names (Iterable[str], optional): Check only the listed tables. Defaults to None, which checks all tables.
+            dry_run (bool, optional): Only detect the drops without applying them to the schema. Defaults to False.
+        Returns:
+            Optional[TSchemaDrop]: The tables and columns that were (or, on a dry run, would be) dropped from the schema.
         """
-        pass
+        from dlt.destinations.sql_client import WithSqlClient
+
+        if not (isinstance(self, WithTableReflection) and isinstance(self, WithSqlClient)):
+            raise NotImplementedError
+
+        def _diff_between_actual_and_dlt_schema(
+            table_name: str, actual_col_names: set[str], disregard_dlt_columns: bool = True
+        ) -> TPartialTableSchema:
+            """Returns a partial table schema containing the columns that exist in the dlt schema
+            but are missing from the actual table. Skips dlt internal columns by default.
+            """
+            col_schemas = self.schema.get_table_columns(table_name)
+
+            # Map escaped -> original names (actual_col_names are escaped)
+            escaped_to_original = {
+                self.sql_client.escape_column_name(col, quote=False): col
+                for col in col_schemas.keys()
+            }
+            dropped_col_names = set(escaped_to_original.keys()) - actual_col_names
+
+            if not dropped_col_names:
+                return {}
+
+            partial_table: TPartialTableSchema = {"name": table_name, "columns": {}}
+
+            for esc_name in dropped_col_names:
+                orig_name = escaped_to_original[esc_name]
+
+                # Athena doesn't report dlt columns among the actual columns. Don't drop them anyway.
+                if disregard_dlt_columns and orig_name in [C_DLT_ID, C_DLT_LOAD_ID]:
+                    continue
+
+                col_schema = col_schemas[orig_name]
+                if col_schema.get("increment"):
+                    # We can warn inside the for loop,
+                    # since there's only one incremental field per table
+                    logger.warning(
+                        f"An incremental field {orig_name} is being removed from the schema."
+                        " You should unset the"
+                        " incremental with `incremental=dlt.sources.incremental.EMPTY`"
+                    )
+                partial_table["columns"][orig_name] = col_schema
+
+            return partial_table if partial_table["columns"] else {}
+
+        tables = table_names if table_names else self.schema.data_table_names()
+
+        table_drops: TSchemaDrop = {}  # entire tables to drop
+        column_drops: TSchemaDrop = {}  # parts of tables to drop, as partial tables
+
+        # 1. Detect what needs to be dropped
+        for table_name in tables:
+            _, actual_col_schemas = list(self.get_storage_tables([table_name]))[0]
+
+            # no actual column schemas ->
+            # table doesn't exist ->
+            # we take the entire table schema as a schema drop
+            if not actual_col_schemas:
+                table = self.schema.get_table(table_name)
+                table_drops[table_name] = table
+                continue
+
+            # actual column schemas present ->
+            # we compare the actual schemas with the dlt ones ->
+            # we take the difference as a partial table
+            else:
+                partial_table = _diff_between_actual_and_dlt_schema(
+                    table_name,
+                    set(actual_col_schemas.keys()),
+                )
+                if partial_table:
+                    column_drops[table_name] = partial_table
+
+        # 2. For entire table drops, make sure no orphaned child tables remain
+        for table_name in table_drops.copy():
+            child_tables = self.schema.get_child_tables(table_name)
+            orphaned_table_names: List[str] = []
+            for child_table in child_tables:
+                if child_table["name"] not in table_drops:
+                    orphaned_table_names.append(child_table["name"])
+            if orphaned_table_names:
+                table_drops.pop(table_name)
+                logger.warning(
+                    f"Removing table '{table_name}' from the dlt schema would leave orphan"
+                    f" table(s): {', '.join(repr(t) for t in orphaned_table_names)}. Drop these"
+                    " child tables in the destination and sync the dlt schema again."
+                )
+
+        # 3. If it's not a dry run, we actually drop from the dlt schema
+        if not dry_run:
+            for table_name in table_drops:
+                self.schema.tables.pop(table_name)
+            for table_name, partial_table in column_drops.items():
+                col_schemas = partial_table["columns"]
+                col_names = list(col_schemas)
+                self.schema.drop_columns(table_name, col_names)
+
+        return {**table_drops, **column_drops}
 
     def prepare_load_table(self, table_name: str) -> PreparedTableSchema:
         """Prepares a table schema to be loaded by filling missing hints and doing other modifications required by the given destination.
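For reviewers, a hedged usage sketch of the new method. Acquiring the client via `pipeline.destination_client()` is standard dlt; the pipeline name, destination, and table names are made up for illustration, and the `type: ignore` reflects that `update_dlt_schema` only works on clients that also implement `WithTableReflection` and `WithSqlClient`:

# Usage sketch, not part of this PR.
import dlt

pipeline = dlt.pipeline("my_pipeline", destination="duckdb")

with pipeline.destination_client() as client:
    # Dry run: report what exists in the dlt schema but not in the
    # destination, without mutating the schema.
    drops = client.update_dlt_schema(dry_run=True)  # type: ignore[attr-defined]
    for table_name, dropped in (drops or {}).items():
        print(table_name, "->", list(dropped.get("columns", {})))

    # Apply the drops for a subset of tables only.
    client.update_dlt_schema(table_names=["orders"])  # type: ignore[attr-defined]

Note that step 2 of the method deliberately refuses to drop a parent table (e.g. "orders") while a child table (e.g. "orders__items") would remain in the schema; it only warns instead.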
@@ -639,6 +726,22 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]:
         pass
 
 
+class WithTableReflection(ABC):
+    @abstractmethod
+    def get_storage_tables(
+        self, table_names: Iterable[str]
+    ) -> Iterable[Tuple[str, TTableSchemaColumns]]:
+        """Uses INFORMATION_SCHEMA to retrieve table and column information for the tables in the `table_names` iterator.
+        Table names should be normalized according to the naming convention and will be further converted to the desired
+        casing in order to (in most cases) create a case-insensitive name suitable for searching the information schema.
+
+        The column names are returned as they appear in the information schema. To match them with the columns of an existing
+        table, you'll need to use the `schema.get_new_table_columns` method and pass the correct casing. Most of the casing
+        functions are irreversible, so it is not possible to convert identifiers from the INFORMATION_SCHEMA back into the
+        case-sensitive dlt schema.
+        """
+        pass
+
+
 class WithStagingDataset(ABC):
     """Adds capability to use staging dataset and request it from the loader"""
 
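Finally, a sketch of what an opt-in implementation of the new ABC could look like for a generic ANSI-SQL destination. The `execute_sql`/`dataset_name` helpers, the presence of `self.sql_client` on the class, and the direct reuse of the raw `data_type` are assumptions for illustration; a real client must map information-schema types to dlt data types and honor the destination's casing rules:

# Hypothetical WithTableReflection implementation for an ANSI-SQL destination;
# assumes the client also carries a dlt sql_client.
from typing import Iterable, Tuple

class ReflectingJobClient(WithTableReflection):
    def get_storage_tables(
        self, table_names: Iterable[str]
    ) -> Iterable[Tuple[str, TTableSchemaColumns]]:
        for table_name in table_names:
            rows = self.sql_client.execute_sql(
                "SELECT column_name, data_type FROM information_schema.columns"
                " WHERE table_schema = %s AND table_name = %s",
                self.sql_client.dataset_name,
                table_name,
            )
            # An empty columns dict signals that the table does not exist,
            # which update_dlt_schema treats as a whole-table drop.
            columns: TTableSchemaColumns = {
                # NOTE: map the raw information-schema type to a dlt data type here
                name: {"name": name, "data_type": data_type}
                for name, data_type in rows
            }
            yield table_name, columns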