Skip to content

Commit 17afa93

Browse files
author
Alex Amies
committed
Allow Hiragana and Katakana in the text segmenter
1 parent e88a1fc commit 17afa93

File tree

2 files changed

+41
-4
lines changed

2 files changed

+41
-4
lines changed

dicttypes/dicttypes.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// you may not use this file except in compliance with the License.
33
// You may obtain a copy of the License at
44
//
5-
// http://www.apache.org/licenses/LICENSE-2.0
5+
// http://www.apache.org/licenses/LICENSE-2.0
66
//
77
// Unless required by applicable law or agreed to in writing, software
88
// distributed under the License is distributed on an "AS IS" BASIS,
@@ -59,7 +59,7 @@ func CloneWord(w Word) Word {
5959
// Only looks at the first character in the string
6060
func IsCJKChar(character string) bool {
6161
r := []rune(character)
62-
return unicode.Is(unicode.Han, r[0]) && !unicode.IsPunct(r[0])
62+
return (unicode.Is(unicode.Han, r[0]) || unicode.Is(unicode.Hiragana, r[0]) || unicode.Is(unicode.Katakana, r[0])) && !unicode.IsPunct(r[0])
6363
}
6464

6565
// Tests whether the word is a function word

dicttypes/dicttypes_test.go

+39-2
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ func makeHW2() Word {
5151
}
5252
}
5353

54-
// TestAddWordSense2Map does a query expecting empty list
54+
// TestCloneWord tests CloneWord
5555
func TestCloneWord(t *testing.T) {
5656
w1 := Word{
5757
Simplified: "你好",
@@ -65,7 +65,7 @@ func TestCloneWord(t *testing.T) {
6565
}
6666
}
6767

68-
// TestAddWordSense2Map does a query expecting empty list
68+
// TestIsProperNoun tests proper noun detection on a word sense
6969
func TestIsProperNoun(t *testing.T) {
7070
s := WordSense{
7171
Simplified: "王",
@@ -273,3 +273,40 @@ func TestIsQuote(t *testing.T) {
273273
}
274274
}
275275
}
276+
277+
// TestIsCJKChar tests IsCJKChar
278+
func TestIsCJKChar(t *testing.T) {
279+
type test struct {
280+
name string
281+
input string
282+
expect bool
283+
}
284+
tests := []test{
285+
{
286+
name: "Chinese",
287+
input: "中国",
288+
expect: true,
289+
},
290+
{
291+
name: "English",
292+
input: "USA",
293+
expect: false,
294+
},
295+
{
296+
name: "Hiragana",
297+
input: "あ",
298+
expect: true,
299+
},
300+
{
301+
name: "Katakana",
302+
input: "ア",
303+
expect: true,
304+
},
305+
}
306+
for _, tc := range tests {
307+
got := IsCJKChar(tc.input)
308+
if got != tc.expect {
309+
t.Errorf("%s: IsCJKChar(%s) got %t but expected %t ", tc.name, tc.input, got, tc.expect)
310+
}
311+
}
312+
}

0 commit comments

Comments
 (0)