-
Notifications
You must be signed in to change notification settings - Fork 5
/
CreateNominalValues.ecl
155 lines (144 loc) · 5.72 KB
/
CreateNominalValues.ecl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/**
* Convert one or more attributes within a dataset to dictionary lookup values.
* An optional old mapping (as if created from a previous execution run) can
* be passed in; if it is provided then those mappings will be reused.
*
* The goal here is to create nominal integer values for every unique value
* of interest, and also to provide a way to update an old nominal map with
* new data.
*
* This macro updates two attributes that you supply, one for the rewritten
* input dataset and one that will contain the mapping. The mapping will have
* the structure:
* RECORD
* STRING fieldName; // Name of attribute
* STRING valueAsString; // Value converted to a string
* UNSIGNED4 nom; // Nominal value
* END;
*
* The actual datatype used for the 'nom' mapping attribute can be easily
* changed; look for '%NomType%' in the code below.
*
* @param inFile IN, REQUIRED The dataset to process
* @param attrListStr IN, REQUIRED A comma-delimited list of
* attributes within the dataset to
* convert; the string should be
* in lowercase
* @param outFile OUT, REQUIRED The attribute that will contain
* the result of converting inFile
* @param outMap OUT, REQUIRED The attribute that will contain
* the new/updated mappings
* @param oldMap IN, OPTIONAL A dataset containing mappings
* from a previous execution; must have
* the record structure described above
*
* Origin: https://github.com/hpccsystems-solutions-lab/Useful_ECL
*/
EXPORT CreateNominalValues(inFile, attrListStr, outFile, outMap, oldMap = '') := MACRO
    // Open an XML scope and export the input record layout into it so the
    // template statements below can iterate over the fields at compile time
    LOADXML('<xml/>');
    #EXPORTXML(inFileFields, RECORDOF(inFile));

    // Datatype used for every nominal value, both in the mapping and in the
    // rewritten dataset; change it here to alter the width
    #UNIQUENAME(NomType);
    %NomType% := UNSIGNED4;

    // Layout of a single mapping entry
    #UNIQUENAME(mapLayout);
    %mapLayout% := RECORD
        STRING      fieldName;
        STRING      valueAsString;
        %NomType%   nom;
    END;

    // The attribute list with all whitespace stripped; the loop further down
    // walks this string by character position
    #UNIQUENAME(attrCsv);
    %attrCsv% := TRIM((STRING)attrListStr, ALL);

    // TRUE when attrName occurs as a complete item of the comma-delimited
    // list (matched case-insensitively); attrType is accepted but not used
    #UNIQUENAME(isTargetAttr);
    %isTargetAttr%(STRING attrName, STRING attrType) := REGEXFIND('(^|,)' + attrName + '(,|$)', %attrCsv%, NOCASE);

    // Result layout: same fields as the input, except that every listed
    // attribute is retyped to the nominal datatype; nested records and child
    // datasets abort compilation
    #UNIQUENAME(resultLayout);
    %resultLayout% := RECORD
        #FOR(inFileFields)
            #FOR(field)
                #IF(%{@isRecord}% = 1 OR %{@isDataset}% = 1 OR %{@isEnd}% = 1)
                    #ERROR('Datasets with embedded records or child datasets not supported')
                #ELSE
                    #IF(%isTargetAttr%(%'@name'%, %'@type'%))
                        %NomType% %@name%;
                    #ELSE
                        %@type% %@name%;
                    #END
                #END
            #END
        #END
    END;

    // Largest nominal already assigned per attribute in the previous mapping;
    // an empty dataset stands in when no previous mapping was supplied
    #UNIQUENAME(priorMaxByField);
    %priorMaxByField% :=
        #IF(#TEXT(oldMap) != '')
            TABLE(oldMap, {fieldName, %NomType% nom := MAX(GROUP, nom)}, fieldName, FEW)
        #ELSE
            DATASET([], {STRING fieldName, %NomType% nom})
        #END;

    #UNIQUENAME(priorMaxDict);
    %priorMaxDict% := DICTIONARY(%priorMaxByField%, {fieldName => nom});

    // Compile-time loop state:
    //   needsPlus - becomes 1 once the first term was emitted, so later terms
    //               are joined with a leading +
    //   scanPos   - 1-based position in the attribute list where the next
    //               attribute name starts
    //   curAttr   - attribute name extracted in the current iteration
    #UNIQUENAME(needsPlus);
    #SET(needsPlus, 0);
    #UNIQUENAME(scanPos);
    #SET(scanPos, 1);
    #UNIQUENAME(curAttr);

    // Build one mapping dataset per listed attribute and append them all.
    // Each distinct value gets COUNTER plus the largest nominal the previous
    // mapping holds for that attribute, so freshly generated nominals always
    // sit above the previously assigned ones. The loop stops as soon as no
    // further name can be read at the current scan position.
    #UNIQUENAME(freshMap);
    %freshMap% :=
        #LOOP
            #SET(curAttr, REGEXFIND('^([^,]+)', %attrCsv%[%scanPos%..], 1))
            #IF(%'curAttr'% != '')
                #IF(%needsPlus% = 1) + #END
                PROJECT
                    (
                        TABLE(inFile, {%curAttr%}, %curAttr%, MERGE),
                        TRANSFORM
                            (
                                %mapLayout%,
                                SELF.fieldName := %'curAttr'%,
                                SELF.valueAsString := (STRING)LEFT.%curAttr%,
                                SELF.nom := COUNTER + %priorMaxDict%[%'curAttr'%].nom
                            )
                    )
                #SET(needsPlus, 1)
                #SET(scanPos, %scanPos% + LENGTH(%'curAttr'%) + 1)
            #ELSE
                #BREAK
            #END
        #END;

    // Final mapping. With a previous mapping, old and new entries are combined
    // and duplicates of the same (fieldName, valueAsString) pair collapse to
    // the entry with the smaller nominal; because new nominals are offset
    // above the old maximum, that is the previously assigned one.
    outMap :=
        #IF(#TEXT(oldMap) != '')
            ROLLUP
                (
                    SORT(%freshMap% + oldMap, fieldName, valueAsString),
                    TRANSFORM
                        (
                            RECORDOF(LEFT),
                            SELF.nom := MIN(LEFT.nom, RIGHT.nom),
                            SELF := LEFT
                        ),
                    fieldName, valueAsString
                )
        #ELSE
            %freshMap%
        #END;

    // Lookup from (attribute name, stringified value) to nominal
    #UNIQUENAME(nomLookup);
    %nomLookup% := DICTIONARY(outMap, {fieldName, valueAsString => nom});

    // Rewrite the input: listed attributes are replaced by their nominal,
    // every other field is copied through unchanged
    outFile := PROJECT
        (
            inFile,
            TRANSFORM
                (
                    %resultLayout%,
                    #FOR(inFileFields)
                        #FOR(field)
                            #IF(%isTargetAttr%(%'@name'%, %'@type'%))
                                SELF.%@name% := %nomLookup%[%'@name'%, (STRING)LEFT.%@name%].nom,
                            #END
                        #END
                    #END
                    SELF := LEFT
                )
        );
ENDMACRO;