This repository has been archived by the owner on Apr 4, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
string.go
146 lines (131 loc) · 4.55 KB
/
string.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
package input
import (
"golang.org/x/text/unicode/norm"
"strings"
)
var (
// Unicode ditch table.
unicodeWhitespaceDitch = map[rune]struct{}{
'\r': struct{}{},
'\u200B': struct{}{}, // Zero width space.
'\u200C': struct{}{}, // Zero width non-joiner.
'\u200D': struct{}{}, // Zero width joiner.
'\u2060': struct{}{}, // Word joiner.
'\uFEFF': struct{}{}, // Zero width no-break.
}
// Whitespace replacement character map.
unicodeWhitespaceRepl = map[rune]struct{}{
'\u0009': struct{}{}, // Character tabulation (HT.)
'\u00A0': struct{}{}, // No-break space.
'\u180E': struct{}{}, // Mongolian vowel separator.
'\u2000': struct{}{}, // En quad.
'\u2001': struct{}{}, // Em quad.
'\u2002': struct{}{}, // En space.
'\u2003': struct{}{}, // Em space.
'\u2004': struct{}{}, // Three-per-em space.
'\u2005': struct{}{}, // Four-per-em space.
'\u2006': struct{}{}, // Six-per-em space.
'\u2007': struct{}{}, // Figure space.
'\u2008': struct{}{}, // Punctuation space.
'\u2009': struct{}{}, // Thin space.
'\u200A': struct{}{}, // Hair space.
'\u2028': struct{}{}, // Line separator.
'\u2029': struct{}{}, // Paragraph separator.
'\u202F': struct{}{}, // Narrow no-break space.
'\u205F': struct{}{}, // Medium mathemtical space.
'\u3000': struct{}{}, // Ideographic space.
}
// Whitespace line replacement character map.
unicodeWhitespaceLineRepl = map[rune]struct{}{
'\u0009': struct{}{}, // Character tabulation (HT.)
'\u000A': struct{}{}, // Line feed.
'\u0020': struct{}{}, // Space.
'\u00A0': struct{}{}, // No-break space.
'\u180E': struct{}{}, // Mongolian vowel separator.
'\u2000': struct{}{}, // En quad.
'\u2001': struct{}{}, // Em quad.
'\u2002': struct{}{}, // En space.
'\u2003': struct{}{}, // Em space.
'\u2004': struct{}{}, // Three-per-em space.
'\u2005': struct{}{}, // Four-per-em space.
'\u2006': struct{}{}, // Six-per-em space.
'\u2007': struct{}{}, // Figure space.
'\u2008': struct{}{}, // Punctuation space.
'\u2009': struct{}{}, // Thin space.
'\u200A': struct{}{}, // Hair space.
'\u2028': struct{}{}, // Line separator.
'\u2029': struct{}{}, // Paragraph separator.
'\u202F': struct{}{}, // Narrow no-break space.
'\u205F': struct{}{}, // Medium mathemtical space.
'\u3000': struct{}{}, // Ideographic space.
}
)
// Normalize string.
//
// Normalizes the string to NFC and replaces characters in the provided Unicode
// whitespace character table with regular spaces, and ditches any character
// in the ditch table.
func normalizeString(in string, spaceTbl, ditchTbl map[rune]struct{}) string {
result := make([]rune, 0, len(in))
for _, r := range norm.NFC.String(in) {
if r == '\u000B' || r == '\u000C' {
// Translate form feed and line tabulation.
r = '\n'
}
if _, ws := spaceTbl[r]; ws {
result = append(result, ' ')
} else if _, ditch := ditchTbl[r]; !ditch {
result = append(result, r)
}
}
return string(result)
}
// Normalize string.
//
// Normalizes the string to NFC and ditches any zero-width whitespace.
func NormalizeString(in string) string {
return normalizeString(in, map[rune]struct{}{}, unicodeWhitespaceDitch)
}
// Whitespace normalize string.
//
// Normalizes the string to NFC and replaces all non-line-breaking Unicode
// whitespace characters with regular spaces, and ditches any carriage returns.
func WhitespaceNormalizeString(in string) string {
return normalizeString(in, unicodeWhitespaceRepl, unicodeWhitespaceDitch)
}
// Whitespace normalize line.
//
// Normalizes the string to NFC and replaces all Unicode whitespace characters
// with regular spaces.
func WhitespaceNormalizeLine(in string) string {
return normalizeString(in, unicodeWhitespaceLineRepl, unicodeWhitespaceDitch)
}
// Trim whitespace normalize line.
//
// Normalizes the string to NFC and replaces all Unicode whitespace characters
// with regular spaces, trimming the final result.
func TrimWhitespaceNormalizeLine(in string) string {
return strings.TrimSpace(WhitespaceNormalizeLine(in))
}
// Trim single-whitespace normalize line.
//
// Normalizes the string to NFC and replaces all Unicode whitespace characters
// with regular spaces, trimming the final result. Multiple whitespaces are
// combined into one whitespace.
func TrimSingleWhitespaceNormalizeLine(in string) string {
norm := TrimWhitespaceNormalizeLine(in)
result := make([]rune, 0, len(norm))
lastWs := false
for _, r := range norm {
if r == ' ' {
if !lastWs {
result = append(result, r)
lastWs = true
}
} else {
lastWs = false
result = append(result, r)
}
}
return string(result)
}