-
Notifications
You must be signed in to change notification settings - Fork 5
/
IsUTF8.ecl
141 lines (137 loc) · 4.8 KB
/
IsUTF8.ecl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/**
 * Reports whether a byte sequence contains well-formed multi-byte UTF-8.
 *
 * Note that if you are processing an ECL STRING value then this function
 * will always return FALSE because the value has already been converted
 * to an ECL STRING (high ASCII, at best).  For an accurate assessment,
 * the argument should be cast as a DATA value.
 *
 * Single bytes are accepted only when they are TAB (0x09), LF (0x0A),
 * CR (0x0D) or printable ASCII (0x20-0x7E); they are skipped and do not
 * count as a "UTF-8 character" for the purposes of the result.  Any other
 * byte that does not begin a well-formed 2-, 3- or 4-byte sequence
 * (overlong forms, surrogates, values above U+10FFFF and truncated
 * sequences are all rejected) makes the function return FALSE immediately.
 *
 * @param   str         The bytes to check, coerced as DATA; REQUIRED
 * @param   validate    If TRUE, scan and validate the entire value; if
 *                      FALSE, stop at the first well-formed multi-byte
 *                      sequence and return TRUE; OPTIONAL, defaults to TRUE
 *
 * @return  With validate = TRUE: TRUE only if the whole value is accepted
 *          and at least one multi-byte sequence was seen.
 *          With validate = FALSE: TRUE as soon as the first well-formed
 *          multi-byte sequence is reached; FALSE if an unacceptable byte
 *          is met first, or if the value ends without any multi-byte
 *          sequence.
 *          Empty input: TRUE if validate is FALSE, FALSE if validate
 *          is TRUE.
 *
 * Origin:  https://github.com/hpccsystems-solutions-lab/Useful_ECL
 */
EXPORT BOOLEAN IsUTF8(DATA str, BOOLEAN validate = TRUE) := EMBED(C++)
    #option pure;
    // Empty input is a documented special case: result is the inverse of validate
    if (lenStr == 0)
        return !validate;

    const unsigned char* cursor = reinterpret_cast<const unsigned char*>(str);
    const unsigned char* const limit = cursor + lenStr;
    bool sawMultibyte = false;

    while (cursor < limit)
    {
        const unsigned char lead = cursor[0];

        // TAB, LF, CR and printable ASCII are simply skipped
        if (lead == 0x09 || lead == 0x0A || lead == 0x0D || (lead >= 0x20 && lead <= 0x7E))
        {
            ++cursor;
            continue;
        }

        // Classify the lead byte: total sequence length plus the permitted
        // range for the second byte (later bytes are always 0x80-0xBF)
        int seqLen = 0;
        unsigned char secondMin = 0x80;
        unsigned char secondMax = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF)
        {
            // 2-byte sequence; 0xC0 and 0xC1 would be overlong
            seqLen = 2;
        }
        else if (lead == 0xE0)
        {
            // 3-byte sequence; raised lower bound excludes overlongs
            seqLen = 3;
            secondMin = 0xA0;
        }
        else if (lead == 0xED)
        {
            // 3-byte sequence; lowered upper bound excludes surrogates
            seqLen = 3;
            secondMax = 0x9F;
        }
        else if (lead >= 0xE1 && lead <= 0xEF)
        {
            // Remaining 3-byte leads (0xE1-0xEC, 0xEE, 0xEF)
            seqLen = 3;
        }
        else if (lead == 0xF0)
        {
            // 4-byte sequence, planes 1-3; raised lower bound excludes overlongs
            seqLen = 4;
            secondMin = 0x90;
        }
        else if (lead >= 0xF1 && lead <= 0xF3)
        {
            // 4-byte sequence, planes 4-15
            seqLen = 4;
        }
        else if (lead == 0xF4)
        {
            // 4-byte sequence, plane 16; upper bound caps at U+10FFFF
            seqLen = 4;
            secondMax = 0x8F;
        }
        else
        {
            // Not an accepted single byte and not a legal lead byte
            return false;
        }

        // The whole sequence must fit inside the buffer before any
        // trailing byte is inspected
        if ((limit - cursor) < seqLen)
            return false;

        if (cursor[1] < secondMin || cursor[1] > secondMax)
            return false;

        for (int i = 2; i < seqLen; ++i)
        {
            if (cursor[i] < 0x80 || cursor[i] > 0xBF)
                return false;
        }

        // A well-formed multi-byte sequence; that is all the caller
        // needs to know when not validating
        if (!validate)
            return true;

        sawMultibyte = true;
        cursor += seqLen;
    }

    return sawMultibyte;
ENDEMBED;