
Commit bef86e0

add data dictionary export
1 parent 4b9870b commit bef86e0

9 files changed: +246 -0 lines changed

Lines changed: 21 additions & 0 deletions
# Data Dictionary

----
## Overview

This project provides a solution for exporting multiple tables as a data dictionary, from either a defined database or a master segment.

----
## Implementation
1. Copy and paste the code into a custom script in Treasure Workflows.
2. Fill in the placeholder values in config/params.yaml (master segment id, Google Sheets folder id, and connection name).

----
## Considerations

N/A

----
## Questions

Please feel free to reach out to [email protected] with any questions you have about using this code.
Lines changed: 9 additions & 0 deletions
#config/params.yaml
td_api_ep: https://api.treasuredata.co.jp                       # TD REST API endpoint
ms_config_ep: https://api-cdp.treasuredata.co.jp/audiences/     # CDP audience (master segment) config endpoint
ms_id: <master segment id>                                      # master segment (audience) id
temp_db: master_segment_data_dictionary_db_<master segment id>  # working database for the extracted schema
temp_schema_tbl: ms_schema                                      # table holding the flattened master segment schema
temp_ms_conf_tbl: ms_conf                                       # table holding the master segment's name
folder_id: <folder id>                                          # Google Drive folder id for the exported spreadsheets
gsheets_conn: <connection name>                                 # TD result connection for Google Sheets
Lines changed: 20 additions & 0 deletions
#db_dd.dig
timezone: Asia/Tokyo

_export:
  !include : config/params.yaml
  td:
    engine: presto
    database: cdp_audience_${ms_id}

# Loop over every table in the database and export its columns to one sheet per table
+for_each_table:
  td_for_each>: queries/db_tables.sql
  _do:
    +to_gsheets:
      td>: queries/db_cols.sql
      result_connection: ${gsheets_conn}
      result_settings:
        spreadsheet_folder: ${folder_id}
        spreadsheet_title: ${td.database}
        sheet_title: ${td.each.dname}
        mode: replace
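For reference, the loop above is equivalent to listing the tables of cdp_audience_${ms_id} and then fetching the columns of each one; the Google Sheets export itself is handled by the result_connection and is not reproduced here. A rough client-side sketch of that logic, assuming pytd and placeholder values:

# Sketch only: mirrors the db_dd.dig loop for local inspection.
import os
import pytd

DB = "cdp_audience_<master segment id>"  # placeholder, matches ${td.database}
client = pytd.Client(apikey=os.environ["TD_API_KEY"],
                     endpoint="https://api.treasuredata.co.jp",
                     database=DB)

tables = client.query(
    "SELECT table_name FROM information_schema.tables "
    f"WHERE table_schema = '{DB}' ORDER BY table_name"
)
for (table_name,) in tables["data"]:
    cols = client.query(
        "SELECT column_name, data_type FROM information_schema.columns "
        f"WHERE table_schema = '{DB}' AND table_name = '{table_name}'"
    )
    print(table_name, cols["data"])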
Lines changed: 31 additions & 0 deletions
#ms_dd.dig
timezone: Asia/Calcutta

_export:
  !include : config/params.yaml
  td:
    engine: presto
    database: ${temp_db}

+create_temp_db:
  td_ddl>:
  create_databases: [ "${temp_db}" ]

# Fetch the master segment config and materialize its schema into ${temp_db}
+run_py:
  py>: scripts.ms_dd.main
  _env:
    TD_API_KEY: ${secret:ap.apikey}
  docker:
    image: "digdag/digdag-python:3.10.1"

# Loop over each (database, table) pair in the master segment and export its columns to one sheet per table
+for_each_table:
  td_for_each>: queries/ms_tables.sql
  _do:
    +to_gsheets:
      td>: queries/ms_cols.sql
      result_connection: ${gsheets_conn}
      result_settings:
        spreadsheet_folder: ${folder_id}
        spreadsheet_title: ap_master_segment_enriched_prd
        sheet_title: ${td.each.db}.${td.each.tbl}
        mode: replace
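The +run_py step calls the CDP audience config endpoint and flattens the response into ${temp_db}.${temp_schema_tbl}. Only a handful of fields are read; a minimal sketch of the shape the script expects, with field names taken from scripts/ms_dd.py and all values purely illustrative:

# Illustrative only -- a real audience config response contains many more fields.
ms_conf_example = {
    "name": "example_master_segment",
    "master": {
        "parentDatabaseName": "src_db",
        "parentTableName": "customers",
    },
    "attributes": [
        {
            "parentDatabaseName": "src_db",
            "parentTableName": "customer_attributes",
            "name": "loyalty_tier",       # alias exposed in the master segment
            "parentColumn": "tier_code",  # underlying column name
        },
    ],
    "behaviors": [
        {
            "parentDatabaseName": "src_db",
            "parentTableName": "web_events",
            "allColumns": True,           # when False, a "schema" list names the columns
        },
    ],
}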
Lines changed: 12 additions & 0 deletions
--queries/db_cols.sql
SELECT
  column_name,
  ordinal_position,
  column_default,
  is_nullable,
  data_type
FROM
  information_schema.columns
WHERE
  table_schema = '${td.database}'
  AND table_name = '${td.each.table_name}'
Lines changed: 10 additions & 0 deletions
--queries/db_tables.sql
SELECT
  table_name,
  -- Hash keeps sheet titles within the MS xlsx sheet name limit (31 chars); match sheets back to tables via this hash
  SUBSTR(TO_BASE64(SHA256(CAST(table_name AS VARBINARY))), 1, 25) AS dname
FROM
  information_schema.tables
WHERE
  table_schema = '${td.database}'
ORDER BY
  table_name ASC
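Because sheet titles use the 25-character hash rather than the raw table name, mapping a sheet back to its source table means recomputing the same value. A minimal Python sketch that mirrors the Presto expression above, using only the standard library:

import base64
import hashlib

def sheet_title_for(table_name: str) -> str:
    """Mirror SUBSTR(TO_BASE64(SHA256(CAST(table_name AS VARBINARY))), 1, 25)."""
    digest = hashlib.sha256(table_name.encode("utf-8")).digest()
    return base64.b64encode(digest).decode("ascii")[:25]

# e.g. sheet_title_for("customers") reproduces the dname used as the sheet title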
Lines changed: 11 additions & 0 deletions
--queries/ms_cols.sql
SELECT
  "column" AS Column,
  column_alias AS Alias,
  "type" AS "Data Type",
  comment AS Comment
FROM
  ${temp_db}.${temp_schema_tbl}
WHERE
  "database" = '${td.each.db}'
  AND "table" = '${td.each.tbl}'
Lines changed: 9 additions & 0 deletions
--queries/ms_tables.sql
SELECT
  "database" AS db,
  "table" AS tbl
FROM
  ${temp_db}.${temp_schema_tbl}
GROUP BY
  "database",
  "table"
Lines changed: 123 additions & 0 deletions
#scripts/ms_dd.py
import os

import pandas as pd
import pytd
import requests


def main(**kwargs):
    # API key passed in via the workflow's _env block
    td_api_key = os.getenv('TD_API_KEY')
    # Params from config/params.yaml (exported by the workflow, passed as kwargs by py>)
    td_api_ep = kwargs.get('td_api_ep')
    ms_config_ep = kwargs.get('ms_config_ep')
    ms_id = kwargs.get('ms_id')
    temp_db = kwargs.get('temp_db')
    temp_schema_tbl = kwargs.get('temp_schema_tbl')
    temp_ms_conf_tbl = kwargs.get('temp_ms_conf_tbl')

    # Init client for importing from / exporting to TD tables
    td = pytd.Client(apikey=td_api_key,
                     endpoint=td_api_ep,
                     database=temp_db,
                     default_engine='presto')

    # Fetch master segment config
    url = ms_config_ep + str(ms_id)
    headers = {'Authorization': f'TD1 {td_api_key}'}
    res = requests.get(url, headers=headers)
    ms_conf = res.json()

    # Extract and store the master segment's name
    ms_name = ms_conf.get('name', 'N/A')
    ms_conf_df = pd.DataFrame({'name': [ms_name]})
    td.load_table_from_dataframe(ms_conf_df, f'{temp_db}.{temp_ms_conf_tbl}', writer='bulk_import', if_exists='overwrite')

    # Extract the master table's database and table
    master_db = ms_conf.get('master', {}).get('parentDatabaseName', None)
    master_tbl = ms_conf.get('master', {}).get('parentTableName', None)
    # Fetch master table schema
    # DESCRIBE returns: Column, Type, Extra, Comment
    # pytd's query() returns {'columns': [...], 'data': [...]}, so it unpacks straight into a DataFrame
    master_res = td.query(f'DESCRIBE {master_db}.{master_tbl}')
    master_df = pd.DataFrame(**master_res)
    master_df = master_df.drop('Extra', axis=1)
    master_df['database'] = master_db
    master_df['table'] = master_tbl
    master_df['column_alias'] = ''
    # Send data to TD table
    td.load_table_from_dataframe(master_df, f'{temp_db}.{temp_schema_tbl}', writer='bulk_import', if_exists='overwrite')

    # Extract database, table, and columns for attribute tables
    # Unique (db, tbl) set
    attribute_tbls = {
        (attr_tbl['parentDatabaseName'], attr_tbl['parentTableName'])
        for attr_tbl in ms_conf.get('attributes', [])
    }
    # All (db, table, column alias, column name) tuples
    attribute_tbls_cols = [
        (attr['parentDatabaseName'], attr['parentTableName'], attr['name'], attr['parentColumn'])
        for attr in ms_conf.get('attributes', [])
    ]
    # Build attribute table schema
    for attr_tbls_db, attr_tbls_tbl in attribute_tbls:
        attr_res = td.query(f'DESCRIBE {attr_tbls_db}.{attr_tbls_tbl}')
        attr_df = pd.DataFrame(**attr_res)
        attr_dict_list = []
        for attr_db, attr_tbl, col_alias, col_name in attribute_tbls_cols:
            if attr_db == attr_tbls_db and attr_tbl == attr_tbls_tbl:
                attr_dict = {
                    'database': attr_db,
                    'table': attr_tbl,
                    'column_alias': col_alias,
                    'column': col_name,
                    'type': attr_df[attr_df['Column'] == col_name].iloc[0, 1],     # Type
                    'comment': attr_df[attr_df['Column'] == col_name].iloc[0, 3],  # Comment
                }
                attr_dict_list.append(attr_dict)
        attr_df = pd.DataFrame(attr_dict_list)
        # Send data to TD table
        td.load_table_from_dataframe(attr_df, f'{temp_db}.{temp_schema_tbl}', writer='bulk_import', if_exists='append')

    # Extract database, table, and columns for behavior tables
    # Unique (db, tbl) set
    behavior_tbls = {
        (behav_tbl['parentDatabaseName'], behav_tbl['parentTableName'])
        for behav_tbl in ms_conf.get('behaviors', [])
    }
    # All (db, table, column alias, column name) tuples
    behavior_tbls_cols = []
    for behav in ms_conf.get('behaviors', []):
        behav_db = behav['parentDatabaseName']
        behav_tbl = behav['parentTableName']
        if behav.get('allColumns', True):
            # No explicit schema: take every column returned by DESCRIBE
            behav_res = td.query(f'DESCRIBE {behav_db}.{behav_tbl}')
            behav_df = pd.DataFrame(**behav_res)
            for _, row in behav_df.iterrows():
                behavior_tbls_cols.append((behav_db, behav_tbl, '', row['Column']))
        else:
            behav_schema = [
                (behav_db, behav_tbl, schema['name'], schema['parentColumn'])
                for schema in behav.get('schema', [])
            ]
            behavior_tbls_cols.extend(behav_schema)
    # Build behavior table schema
    for behav_tbls_db, behav_tbls_tbl in behavior_tbls:
        behav_res = td.query(f'DESCRIBE {behav_tbls_db}.{behav_tbls_tbl}')
        behav_df = pd.DataFrame(**behav_res)
        behav_dict_list = []
        for behav_db, behav_tbl, col_alias, col_name in behavior_tbls_cols:
            if behav_db == behav_tbls_db and behav_tbl == behav_tbls_tbl:
                behav_dict = {
                    'database': behav_db,
                    'table': behav_tbl,
                    'column_alias': col_alias,
                    'column': col_name,
                    'type': behav_df[behav_df['Column'] == col_name].iloc[0, 1],     # Type
                    'comment': behav_df[behav_df['Column'] == col_name].iloc[0, 3],  # Comment
                }
                behav_dict_list.append(behav_dict)
        behav_df = pd.DataFrame(behav_dict_list)
        # Send data to TD table
        td.load_table_from_dataframe(behav_df, f'{temp_db}.{temp_schema_tbl}', writer='bulk_import', if_exists='append')


# Main
if __name__ == "__main__":
    main()
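For a quick check outside Treasure Workflows, the entry point can be called directly with the same parameters that config/params.yaml supplies via the workflow. A hypothetical local invocation (all values are placeholders; pytd, pandas, and requests must be installed, a valid TD_API_KEY exported, and the working database already created):

# Hypothetical local smoke test for scripts/ms_dd.py -- not part of the workflow.
import os
os.environ.setdefault("TD_API_KEY", "<your TD API key>")

from scripts.ms_dd import main

main(
    td_api_ep="https://api.treasuredata.co.jp",
    ms_config_ep="https://api-cdp.treasuredata.co.jp/audiences/",
    ms_id="<master segment id>",
    temp_db="master_segment_data_dictionary_db_<master segment id>",
    temp_schema_tbl="ms_schema",
    temp_ms_conf_tbl="ms_conf",
)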
