
Commit bef86e0

add data dictionary export
1 parent 4b9870b commit bef86e0

9 files changed: +246 -0 lines changed

Lines changed: 21 additions & 0 deletions
# Data Dictionary

----
## Overview

This project provides a solution for exporting multiple tables as a data dictionary, from either a defined database or a master segment.

----
## Implementation
1. Copy and paste the code into a custom script in Treasure Workflows.
2. Fill in the placeholder values in config/params.yaml (master segment id, Google Sheets folder id, and connection name).

----
## Considerations

N/A

----
## Questions

Please feel free to reach out to [email protected] with any questions you have about using this code.
Lines changed: 9 additions & 0 deletions
#config/params.yaml
td_api_ep: https://api.treasuredata.co.jp                       # TD REST API endpoint
ms_config_ep: https://api-cdp.treasuredata.co.jp/audiences/     # CDP audience (master segment) config endpoint
ms_id: <master segment id>                                      # master segment (audience) id
temp_db: master_segment_data_dictionary_db_<master segment id>  # working database for the extracted schema
temp_schema_tbl: ms_schema                                      # table holding the flattened master segment schema
temp_ms_conf_tbl: ms_conf                                       # table holding the master segment's name
folder_id: <folder id>                                          # Google Drive folder id for the exported spreadsheets
gsheets_conn: <connection name>                                 # TD result connection for Google Sheets
Lines changed: 20 additions & 0 deletions
#db_dd.dig
timezone: Asia/Tokyo

_export:
  !include : config/params.yaml
  td:
    engine: presto
    database: cdp_audience_${ms_id}

# Loop over every table in the database and export its columns to one sheet per table
+for_each_table:
  td_for_each>: queries/db_tables.sql
  _do:
    +to_gsheets:
      td>: queries/db_cols.sql
      result_connection: ${gsheets_conn}
      result_settings:
        spreadsheet_folder: ${folder_id}
        spreadsheet_title: ${td.database}
        sheet_title: ${td.each.dname}
        mode: replace
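For reference, the loop above is equivalent to listing the tables of cdp_audience_${ms_id} and then fetching the columns of each one; the Google Sheets export itself is handled by the result_connection and is not reproduced here. A rough client-side sketch of that logic, assuming pytd and placeholder values:

# Sketch only: mirrors the db_dd.dig loop for local inspection.
import os
import pytd

DB = "cdp_audience_<master segment id>"  # placeholder, matches ${td.database}
client = pytd.Client(apikey=os.environ["TD_API_KEY"],
                     endpoint="https://api.treasuredata.co.jp",
                     database=DB)

tables = client.query(
    "SELECT table_name FROM information_schema.tables "
    f"WHERE table_schema = '{DB}' ORDER BY table_name"
)
for (table_name,) in tables["data"]:
    cols = client.query(
        "SELECT column_name, data_type FROM information_schema.columns "
        f"WHERE table_schema = '{DB}' AND table_name = '{table_name}'"
    )
    print(table_name, cols["data"])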
Lines changed: 31 additions & 0 deletions
#ms_dd.dig
timezone: Asia/Calcutta

_export:
  !include : config/params.yaml
  td:
    engine: presto
    database: ${temp_db}

+create_temp_db:
  td_ddl>:
  create_databases: [ "${temp_db}" ]

# Fetch the master segment config and materialize its schema into ${temp_db}
+run_py:
  py>: scripts.ms_dd.main
  _env:
    TD_API_KEY: ${secret:ap.apikey}
  docker:
    image: "digdag/digdag-python:3.10.1"

# Loop over each (database, table) pair in the master segment and export its columns to one sheet per table
+for_each_table:
  td_for_each>: queries/ms_tables.sql
  _do:
    +to_gsheets:
      td>: queries/ms_cols.sql
      result_connection: ${gsheets_conn}
      result_settings:
        spreadsheet_folder: ${folder_id}
        spreadsheet_title: ap_master_segment_enriched_prd
        sheet_title: ${td.each.db}.${td.each.tbl}
        mode: replace
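The +run_py step calls the CDP audience config endpoint and flattens the response into ${temp_db}.${temp_schema_tbl}. Only a handful of fields are read; a minimal sketch of the shape the script expects, with field names taken from scripts/ms_dd.py and all values purely illustrative:

# Illustrative only -- a real audience config response contains many more fields.
ms_conf_example = {
    "name": "example_master_segment",
    "master": {
        "parentDatabaseName": "src_db",
        "parentTableName": "customers",
    },
    "attributes": [
        {
            "parentDatabaseName": "src_db",
            "parentTableName": "customer_attributes",
            "name": "loyalty_tier",       # alias exposed in the master segment
            "parentColumn": "tier_code",  # underlying column name
        },
    ],
    "behaviors": [
        {
            "parentDatabaseName": "src_db",
            "parentTableName": "web_events",
            "allColumns": True,           # when False, a "schema" list names the columns
        },
    ],
}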
Lines changed: 12 additions & 0 deletions
--queries/db_cols.sql
SELECT
  column_name,
  ordinal_position,
  column_default,
  is_nullable,
  data_type
FROM
  information_schema.columns
WHERE
  table_schema = '${td.database}'
  AND table_name = '${td.each.table_name}'
Lines changed: 10 additions & 0 deletions
--queries/db_tables.sql
SELECT
  table_name,
  -- Hash keeps sheet titles within the MS xlsx sheet name limit (31 chars); match sheets back to tables via this hash
  SUBSTR(TO_BASE64(SHA256(CAST(table_name AS VARBINARY))), 1, 25) AS dname
FROM
  information_schema.tables
WHERE
  table_schema = '${td.database}'
ORDER BY
  table_name ASC
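Because sheet titles use the 25-character hash rather than the raw table name, mapping a sheet back to its source table means recomputing the same value. A minimal Python sketch that mirrors the Presto expression above, using only the standard library:

import base64
import hashlib

def sheet_title_for(table_name: str) -> str:
    """Mirror SUBSTR(TO_BASE64(SHA256(CAST(table_name AS VARBINARY))), 1, 25)."""
    digest = hashlib.sha256(table_name.encode("utf-8")).digest()
    return base64.b64encode(digest).decode("ascii")[:25]

# e.g. sheet_title_for("customers") reproduces the dname used as the sheet title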
Lines changed: 11 additions & 0 deletions
--queries/ms_cols.sql
SELECT
  "column" AS Column,
  column_alias AS Alias,
  "type" AS "Data Type",
  comment AS Comment
FROM
  ${temp_db}.${temp_schema_tbl}
WHERE
  "database" = '${td.each.db}'
  AND "table" = '${td.each.tbl}'
Lines changed: 9 additions & 0 deletions
--queries/ms_tables.sql
SELECT
  "database" AS db,
  "table" AS tbl
FROM
  ${temp_db}.${temp_schema_tbl}
GROUP BY
  "database",
  "table"
Lines changed: 123 additions & 0 deletions
#scripts/ms_dd.py
import os

import pandas as pd
import pytd
import requests


def main(**kwargs):
    # API key passed in via the workflow's _env block
    td_api_key = os.getenv('TD_API_KEY')
    # Params from config/params.yaml (exported by the workflow, passed as kwargs by py>)
    td_api_ep = kwargs.get('td_api_ep')
    ms_config_ep = kwargs.get('ms_config_ep')
    ms_id = kwargs.get('ms_id')
    temp_db = kwargs.get('temp_db')
    temp_schema_tbl = kwargs.get('temp_schema_tbl')
    temp_ms_conf_tbl = kwargs.get('temp_ms_conf_tbl')

    # Init client for importing from / exporting to TD tables
    td = pytd.Client(apikey=td_api_key,
                     endpoint=td_api_ep,
                     database=temp_db,
                     default_engine='presto')

    # Fetch master segment config
    url = ms_config_ep + str(ms_id)
    headers = {'Authorization': f'TD1 {td_api_key}'}
    res = requests.get(url, headers=headers)
    ms_conf = res.json()

    # Extract and store the master segment's name
    ms_name = ms_conf.get('name', 'N/A')
    ms_conf_df = pd.DataFrame({'name': [ms_name]})
    td.load_table_from_dataframe(ms_conf_df, f'{temp_db}.{temp_ms_conf_tbl}', writer='bulk_import', if_exists='overwrite')

    # Extract the master table's database and table
    master_db = ms_conf.get('master', {}).get('parentDatabaseName', None)
    master_tbl = ms_conf.get('master', {}).get('parentTableName', None)
    # Fetch master table schema
    # DESCRIBE returns: Column, Type, Extra, Comment
    # pytd's query() returns {'columns': [...], 'data': [...]}, so it unpacks straight into a DataFrame
    master_res = td.query(f'DESCRIBE {master_db}.{master_tbl}')
    master_df = pd.DataFrame(**master_res)
    master_df = master_df.drop('Extra', axis=1)
    master_df['database'] = master_db
    master_df['table'] = master_tbl
    master_df['column_alias'] = ''
    # Send data to TD table
    td.load_table_from_dataframe(master_df, f'{temp_db}.{temp_schema_tbl}', writer='bulk_import', if_exists='overwrite')

    # Extract database, table, and columns for attribute tables
    # Unique (db, tbl) set
    attribute_tbls = {
        (attr_tbl['parentDatabaseName'], attr_tbl['parentTableName'])
        for attr_tbl in ms_conf.get('attributes', [])
    }
    # All (db, table, column alias, column name) tuples
    attribute_tbls_cols = [
        (attr['parentDatabaseName'], attr['parentTableName'], attr['name'], attr['parentColumn'])
        for attr in ms_conf.get('attributes', [])
    ]
    # Build attribute table schema
    for attr_tbls_db, attr_tbls_tbl in attribute_tbls:
        attr_res = td.query(f'DESCRIBE {attr_tbls_db}.{attr_tbls_tbl}')
        attr_df = pd.DataFrame(**attr_res)
        attr_dict_list = []
        for attr_db, attr_tbl, col_alias, col_name in attribute_tbls_cols:
            if attr_db == attr_tbls_db and attr_tbl == attr_tbls_tbl:
                attr_dict = {
                    'database': attr_db,
                    'table': attr_tbl,
                    'column_alias': col_alias,
                    'column': col_name,
                    'type': attr_df[attr_df['Column'] == col_name].iloc[0, 1],     # Type
                    'comment': attr_df[attr_df['Column'] == col_name].iloc[0, 3],  # Comment
                }
                attr_dict_list.append(attr_dict)
        attr_df = pd.DataFrame(attr_dict_list)
        # Send data to TD table
        td.load_table_from_dataframe(attr_df, f'{temp_db}.{temp_schema_tbl}', writer='bulk_import', if_exists='append')

    # Extract database, table, and columns for behavior tables
    # Unique (db, tbl) set
    behavior_tbls = {
        (behav_tbl['parentDatabaseName'], behav_tbl['parentTableName'])
        for behav_tbl in ms_conf.get('behaviors', [])
    }
    # All (db, table, column alias, column name) tuples
    behavior_tbls_cols = []
    for behav in ms_conf.get('behaviors', []):
        behav_db = behav['parentDatabaseName']
        behav_tbl = behav['parentTableName']
        if behav.get('allColumns', True):
            # No explicit schema: take every column returned by DESCRIBE
            behav_res = td.query(f'DESCRIBE {behav_db}.{behav_tbl}')
            behav_df = pd.DataFrame(**behav_res)
            for _, row in behav_df.iterrows():
                behavior_tbls_cols.append((behav_db, behav_tbl, '', row['Column']))
        else:
            behav_schema = [
                (behav_db, behav_tbl, schema['name'], schema['parentColumn'])
                for schema in behav.get('schema', [])
            ]
            behavior_tbls_cols.extend(behav_schema)
    # Build behavior table schema
    for behav_tbls_db, behav_tbls_tbl in behavior_tbls:
        behav_res = td.query(f'DESCRIBE {behav_tbls_db}.{behav_tbls_tbl}')
        behav_df = pd.DataFrame(**behav_res)
        behav_dict_list = []
        for behav_db, behav_tbl, col_alias, col_name in behavior_tbls_cols:
            if behav_db == behav_tbls_db and behav_tbl == behav_tbls_tbl:
                behav_dict = {
                    'database': behav_db,
                    'table': behav_tbl,
                    'column_alias': col_alias,
                    'column': col_name,
                    'type': behav_df[behav_df['Column'] == col_name].iloc[0, 1],     # Type
                    'comment': behav_df[behav_df['Column'] == col_name].iloc[0, 3],  # Comment
                }
                behav_dict_list.append(behav_dict)
        behav_df = pd.DataFrame(behav_dict_list)
        # Send data to TD table
        td.load_table_from_dataframe(behav_df, f'{temp_db}.{temp_schema_tbl}', writer='bulk_import', if_exists='append')


# Main
if __name__ == "__main__":
    main()
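For a quick check outside Treasure Workflows, the entry point can be called directly with the same parameters that config/params.yaml supplies via the workflow. A hypothetical local invocation (all values are placeholders; pytd, pandas, and requests must be installed, a valid TD_API_KEY exported, and the working database already created):

# Hypothetical local smoke test for scripts/ms_dd.py -- not part of the workflow.
import os
os.environ.setdefault("TD_API_KEY", "<your TD API key>")

from scripts.ms_dd import main

main(
    td_api_ep="https://api.treasuredata.co.jp",
    ms_config_ep="https://api-cdp.treasuredata.co.jp/audiences/",
    ms_id="<master segment id>",
    temp_db="master_segment_data_dictionary_db_<master segment id>",
    temp_schema_tbl="ms_schema",
    temp_ms_conf_tbl="ms_conf",
)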
