Skip to content

Commit 3a7624b

Browse files
committed
add deterministic probabilistic deduplication solution
1 parent 9e6ca39 commit 3a7624b

File tree

6 files changed

+700
-0
lines changed

6 files changed

+700
-0
lines changed
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Deterministic and probabilistic deduplication
2+
3+
----
4+
## Overview
5+
6+
This project provides a solution to deterministically and probabilistically deduplicate a given dataset that has no reliable identifier.
7+
8+
----
9+
## Implementation
10+
1. Copy and paste this code into Treasure Workflows or run it with TD toolbelt.
11+
2. Set your TD master key as the workflow secret.
12+
3. Change the database and tables in the config/params.yaml file.
13+
14+
----
15+
## Considerations
16+
17+
This project was developed for a Vietnamese automobile customer. Consider changing the cleanse.sql accordingly to normalize characters and the variables in the scripts to better suit your needs.
18+
19+
The probabilistic matching script (pm.py) uses multiprocessing, consider changing the settings according to your dataset size.
20+
21+
----
22+
## Questions
23+
24+
Please feel free to reach out to [email protected] with any questions you have about using this code.
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
db: rotd_dm_pm
2+
source_tbl: customers
3+
clean_tbl: clean_sap_customer
4+
dm_tbl: dm_dedup
5+
pm_tbl: pm_dedup_eval
6+
api_endpoint: https://api.treasuredata.com
7+
positive_threshold: 80
8+
name_weight: 1
9+
address_weight: 1
10+
partition_cnt: 16
11+
process_cnt: 8
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
timezone: Asia/Tokyo
2+
3+
_export:
4+
!include : config/params.yaml
5+
td:
6+
engine: presto
7+
database: ${db}
8+
9+
+create_db_tbl_if_not_exists:
10+
td_ddl>:
11+
create_databases: [ "${db}" ]
12+
create_tables: [ "${pm_tbl}" ]
13+
empty_tables: [ "${clean_tbl}", "${dm_tbl}", "${pm_tbl}" ]
14+
15+
+create_db_tbl_if_not_exists:
16+
td_ddl>:
17+
create_databases: [ "${db}" ]
18+
create_tables: [ "${clean_tbl}", "${dm_tbl}", "${pm_tbl}" ]
19+
empty_tables: [ "${clean_tbl}", "${dm_tbl}", "${pm_tbl}" ]
20+
21+
+cleansing:
22+
td>: queries/cleanse.sql
23+
create_table: ${clean_tbl}
24+
25+
# Deterministic Matching at tier 4
26+
+dm:
27+
py>: scripts.dm.main
28+
_env:
29+
TD_API_KEY: ${secret:td.apikey}
30+
TD_API_ENDPOINT: ${api_endpoint}
31+
docker:
32+
image: "digdag/digdag-python:3.10"
33+
resource_level: "4"
34+
35+
# Probabilistic Matching with multi-processing & multiple tasks at tier 4
36+
+pm:
37+
loop>: ${partition_cnt}
38+
_parallel: false
39+
_do:
40+
+run_serial:
41+
py>: scripts.pm.main
42+
_env:
43+
TD_API_KEY: ${secret:td.apikey}
44+
TD_API_ENDPOINT: ${api_endpoint}
45+
PROCESS_CNT: ${process_cnt}
46+
PART: ${i}
47+
PARTS: ${partition_cnt}
48+
docker:
49+
image: "digdag/digdag-python:3.10"
50+
resource_level: "4"
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
SELECT
2+
TRIM(CAST(customer_code AS VARCHAR)) AS cl_cc,
3+
regexp_replace(TRIM(mobile_phone), '\s+', '') AS cl_phone,
4+
regexp_replace(TRIM(identification_number), '\s+', '') AS cl_vin,
5+
LOWER(TRIM(email)) AS cl_email,
6+
regexp_replace(
7+
regexp_replace(
8+
regexp_replace(
9+
regexp_replace(
10+
regexp_replace(
11+
regexp_replace(
12+
regexp_replace(
13+
regexp_replace(
14+
LOWER(TRIM(customer_name)),
15+
'\s+',
16+
' '
17+
),
18+
'[áàảãạăắằẳẵặâấầẩẫậ]',
19+
'a'
20+
),
21+
'[éèẻẽẹêếềểễệ]',
22+
'e'
23+
),
24+
'[íìỉĩị]',
25+
'i'
26+
),
27+
'[óòỏõọôốồổỗộơớờởỡợ]',
28+
'o'
29+
),
30+
'[úùủũụưứừửữự]',
31+
'u'
32+
),
33+
'[ýỳỷỹỵ]',
34+
'y'
35+
),
36+
'đ',
37+
'd'
38+
) AS cl_customer_name,
39+
CONCAT(
40+
regexp_replace(
41+
regexp_replace(
42+
regexp_replace(
43+
regexp_replace(
44+
regexp_replace(
45+
regexp_replace(
46+
regexp_replace(
47+
regexp_replace(
48+
LOWER(TRIM(city)),
49+
'\s+',
50+
' '
51+
),
52+
'[áàảãạăắằẳẵặâấầẩẫậ]',
53+
'a'
54+
),
55+
'[éèẻẽẹêếềểễệ]',
56+
'e'
57+
),
58+
'[íìỉĩị]',
59+
'i'
60+
),
61+
'[óòỏõọôốồổỗộơớờởỡợ]',
62+
'o'
63+
),
64+
'[úùủũụưứừửữự]',
65+
'u'
66+
),
67+
'[ýỳỷỹỵ]',
68+
'y'
69+
),
70+
'đ',
71+
'd'
72+
),
73+
';',
74+
regexp_replace(
75+
regexp_replace(
76+
regexp_replace(
77+
regexp_replace(
78+
regexp_replace(
79+
regexp_replace(
80+
regexp_replace(
81+
regexp_replace(
82+
LOWER(TRIM(district)),
83+
'\s+',
84+
' '
85+
),
86+
'[áàảãạăắằẳẵặâấầẩẫậ]',
87+
'a'
88+
),
89+
'[éèẻẽẹêếềểễệ]',
90+
'e'
91+
),
92+
'[íìỉĩị]',
93+
'i'
94+
),
95+
'[óòỏõọôốồổỗộơớờởỡợ]',
96+
'o'
97+
),
98+
'[úùủũụưứừửữự]',
99+
'u'
100+
),
101+
'[ýỳỷỹỵ]',
102+
'y'
103+
),
104+
'đ',
105+
'd'
106+
),
107+
';',
108+
regexp_replace(
109+
regexp_replace(
110+
regexp_replace(
111+
regexp_replace(
112+
regexp_replace(
113+
regexp_replace(
114+
regexp_replace(
115+
regexp_replace(
116+
LOWER(TRIM(customer_address)),
117+
'\s+',
118+
' '
119+
),
120+
'[áàảãạăắằẳẵặâấầẩẫậ]',
121+
'a'
122+
),
123+
'[éèẻẽẹêếềểễệ]',
124+
'e'
125+
),
126+
'[íìỉĩị]',
127+
'i'
128+
),
129+
'[óòỏõọôốồổỗộơớờởỡợ]',
130+
'o'
131+
),
132+
'[úùủũụưứừửữự]',
133+
'u'
134+
),
135+
'[ýỳỷỹỵ]',
136+
'y'
137+
),
138+
'đ',
139+
'd'
140+
)) AS cl_full_address,
141+
regexp_replace(
142+
regexp_replace(
143+
regexp_replace(
144+
regexp_replace(
145+
regexp_replace(
146+
regexp_replace(
147+
regexp_replace(
148+
regexp_replace(
149+
LOWER(TRIM(city)),
150+
'\s+',
151+
' '
152+
),
153+
'[áàảãạăắằẳẵặâấầẩẫậ]',
154+
'a'
155+
),
156+
'[éèẻẽẹêếềểễệ]',
157+
'e'
158+
),
159+
'[íìỉĩị]',
160+
'i'
161+
),
162+
'[óòỏõọôốồổỗộơớờởỡợ]',
163+
'o'
164+
),
165+
'[úùủũụưứừửữự]',
166+
'u'
167+
),
168+
'[ýỳỷỹỵ]',
169+
'y'
170+
),
171+
'đ',
172+
'd'
173+
) AS cl_city,
174+
*
175+
FROM
176+
${db}.${source_tbl}

0 commit comments

Comments
 (0)