File tree Expand file tree Collapse file tree 6 files changed +700
-0
lines changed
data-box/deduplication_deterministic_probabilistic Expand file tree Collapse file tree 6 files changed +700
-0
lines changed Original file line number Diff line number Diff line change
1
+ # Deterministic and probabilistic deduplication
2
+
3
+ ----
4
+ ## Overview
5
+
6
+ This project provides a solution to deterministically and probabilistically deduplicate a given dataset that has no reliable identifier.
7
+
8
+ ----
9
+ ## Implementation
10
+ 1 . Copy and paste this code into Treasure Workflows or run it with TD toolbelt.
11
+ 2 . Set your TD master key as the workflow secret.
12
+ 3 . Change the database and tables in the config/params.yaml file.
13
+
14
+ ----
15
+ ## Considerations
16
+
17
+ This project was developed for a Vietnamese automobile customer. Consider changing the cleanse.sql accordingly to normalize characters and the variables in the scripts to better suit your needs.
18
+
19
+ The probabilistic matching script (pm.py) uses multiprocessing, consider changing the settings according to your dataset size.
20
+
21
+ ----
22
+ ## Questions
23
+
24
+ Please feel free to reach out to
[email protected] with any questions you have about using this code.
Original file line number Diff line number Diff line change
1
+ db : rotd_dm_pm
2
+ source_tbl : customers
3
+ clean_tbl : clean_sap_customer
4
+ dm_tbl : dm_dedup
5
+ pm_tbl : pm_dedup_eval
6
+ api_endpoint : https://api.treasuredata.com
7
+ positive_threshold : 80
8
+ name_weight : 1
9
+ address_weight : 1
10
+ partition_cnt : 16
11
+ process_cnt : 8
Original file line number Diff line number Diff line change
1
+ timezone: Asia/Tokyo
2
+
3
+ _export:
4
+ !include : config/params.yaml
5
+ td:
6
+ engine: presto
7
+ database: ${db}
8
+
9
+ +create_db_tbl_if_not_exists:
10
+ td_ddl>:
11
+ create_databases: [ "${db}" ]
12
+ create_tables: [ "${pm_tbl}" ]
13
+ empty_tables: [ "${clean_tbl}", "${dm_tbl}", "${pm_tbl}" ]
14
+
15
+ +create_db_tbl_if_not_exists:
16
+ td_ddl>:
17
+ create_databases: [ "${db}" ]
18
+ create_tables: [ "${clean_tbl}", "${dm_tbl}", "${pm_tbl}" ]
19
+ empty_tables: [ "${clean_tbl}", "${dm_tbl}", "${pm_tbl}" ]
20
+
21
+ +cleansing:
22
+ td>: queries/cleanse.sql
23
+ create_table: ${clean_tbl}
24
+
25
+ # Deterministic Matching at tier 4
26
+ +dm:
27
+ py>: scripts.dm.main
28
+ _env:
29
+ TD_API_KEY: ${secret:td.apikey}
30
+ TD_API_ENDPOINT: ${api_endpoint}
31
+ docker:
32
+ image: "digdag/digdag-python:3.10"
33
+ resource_level: "4"
34
+
35
+ # Probabilistic Matching with multi-processing & multiple tasks at tier 4
36
+ +pm:
37
+ loop>: ${partition_cnt}
38
+ _parallel: false
39
+ _do:
40
+ +run_serial:
41
+ py>: scripts.pm.main
42
+ _env:
43
+ TD_API_KEY: ${secret:td.apikey}
44
+ TD_API_ENDPOINT: ${api_endpoint}
45
+ PROCESS_CNT: ${process_cnt}
46
+ PART: ${i}
47
+ PARTS: ${partition_cnt}
48
+ docker:
49
+ image: "digdag/digdag-python:3.10"
50
+ resource_level: "4"
Original file line number Diff line number Diff line change
1
+ SELECT
2
+ TRIM (CAST(customer_code AS VARCHAR )) AS cl_cc,
3
+ regexp_replace(TRIM (mobile_phone), ' \s +' , ' ' ) AS cl_phone,
4
+ regexp_replace(TRIM (identification_number), ' \s +' , ' ' ) AS cl_vin,
5
+ LOWER (TRIM (email)) AS cl_email,
6
+ regexp_replace(
7
+ regexp_replace(
8
+ regexp_replace(
9
+ regexp_replace(
10
+ regexp_replace(
11
+ regexp_replace(
12
+ regexp_replace(
13
+ regexp_replace(
14
+ LOWER (TRIM (customer_name)),
15
+ ' \s +' ,
16
+ ' '
17
+ ),
18
+ ' [áàảãạăắằẳẵặâấầẩẫậ]' ,
19
+ ' a'
20
+ ),
21
+ ' [éèẻẽẹêếềểễệ]' ,
22
+ ' e'
23
+ ),
24
+ ' [íìỉĩị]' ,
25
+ ' i'
26
+ ),
27
+ ' [óòỏõọôốồổỗộơớờởỡợ]' ,
28
+ ' o'
29
+ ),
30
+ ' [úùủũụưứừửữự]' ,
31
+ ' u'
32
+ ),
33
+ ' [ýỳỷỹỵ]' ,
34
+ ' y'
35
+ ),
36
+ ' đ' ,
37
+ ' d'
38
+ ) AS cl_customer_name,
39
+ CONCAT(
40
+ regexp_replace(
41
+ regexp_replace(
42
+ regexp_replace(
43
+ regexp_replace(
44
+ regexp_replace(
45
+ regexp_replace(
46
+ regexp_replace(
47
+ regexp_replace(
48
+ LOWER (TRIM (city)),
49
+ ' \s +' ,
50
+ ' '
51
+ ),
52
+ ' [áàảãạăắằẳẵặâấầẩẫậ]' ,
53
+ ' a'
54
+ ),
55
+ ' [éèẻẽẹêếềểễệ]' ,
56
+ ' e'
57
+ ),
58
+ ' [íìỉĩị]' ,
59
+ ' i'
60
+ ),
61
+ ' [óòỏõọôốồổỗộơớờởỡợ]' ,
62
+ ' o'
63
+ ),
64
+ ' [úùủũụưứừửữự]' ,
65
+ ' u'
66
+ ),
67
+ ' [ýỳỷỹỵ]' ,
68
+ ' y'
69
+ ),
70
+ ' đ' ,
71
+ ' d'
72
+ ),
73
+ ' ;' ,
74
+ regexp_replace(
75
+ regexp_replace(
76
+ regexp_replace(
77
+ regexp_replace(
78
+ regexp_replace(
79
+ regexp_replace(
80
+ regexp_replace(
81
+ regexp_replace(
82
+ LOWER (TRIM (district)),
83
+ ' \s +' ,
84
+ ' '
85
+ ),
86
+ ' [áàảãạăắằẳẵặâấầẩẫậ]' ,
87
+ ' a'
88
+ ),
89
+ ' [éèẻẽẹêếềểễệ]' ,
90
+ ' e'
91
+ ),
92
+ ' [íìỉĩị]' ,
93
+ ' i'
94
+ ),
95
+ ' [óòỏõọôốồổỗộơớờởỡợ]' ,
96
+ ' o'
97
+ ),
98
+ ' [úùủũụưứừửữự]' ,
99
+ ' u'
100
+ ),
101
+ ' [ýỳỷỹỵ]' ,
102
+ ' y'
103
+ ),
104
+ ' đ' ,
105
+ ' d'
106
+ ),
107
+ ' ;' ,
108
+ regexp_replace(
109
+ regexp_replace(
110
+ regexp_replace(
111
+ regexp_replace(
112
+ regexp_replace(
113
+ regexp_replace(
114
+ regexp_replace(
115
+ regexp_replace(
116
+ LOWER (TRIM (customer_address)),
117
+ ' \s +' ,
118
+ ' '
119
+ ),
120
+ ' [áàảãạăắằẳẵặâấầẩẫậ]' ,
121
+ ' a'
122
+ ),
123
+ ' [éèẻẽẹêếềểễệ]' ,
124
+ ' e'
125
+ ),
126
+ ' [íìỉĩị]' ,
127
+ ' i'
128
+ ),
129
+ ' [óòỏõọôốồổỗộơớờởỡợ]' ,
130
+ ' o'
131
+ ),
132
+ ' [úùủũụưứừửữự]' ,
133
+ ' u'
134
+ ),
135
+ ' [ýỳỷỹỵ]' ,
136
+ ' y'
137
+ ),
138
+ ' đ' ,
139
+ ' d'
140
+ )) AS cl_full_address,
141
+ regexp_replace(
142
+ regexp_replace(
143
+ regexp_replace(
144
+ regexp_replace(
145
+ regexp_replace(
146
+ regexp_replace(
147
+ regexp_replace(
148
+ regexp_replace(
149
+ LOWER (TRIM (city)),
150
+ ' \s +' ,
151
+ ' '
152
+ ),
153
+ ' [áàảãạăắằẳẵặâấầẩẫậ]' ,
154
+ ' a'
155
+ ),
156
+ ' [éèẻẽẹêếềểễệ]' ,
157
+ ' e'
158
+ ),
159
+ ' [íìỉĩị]' ,
160
+ ' i'
161
+ ),
162
+ ' [óòỏõọôốồổỗộơớờởỡợ]' ,
163
+ ' o'
164
+ ),
165
+ ' [úùủũụưứừửữự]' ,
166
+ ' u'
167
+ ),
168
+ ' [ýỳỷỹỵ]' ,
169
+ ' y'
170
+ ),
171
+ ' đ' ,
172
+ ' d'
173
+ ) AS cl_city,
174
+ *
175
+ FROM
176
+ ${db}.${source_tbl}
You can’t perform that action at this time.
0 commit comments