Skip to content

Commit 7318710

Browse files
committed
支持98年人民日报的复合词语料格式,如"[中央/n 人民/n 广播/vn 电台/n]nt"
1 parent ff78339 commit 7318710

File tree

3 files changed

+9
-4
lines changed

3 files changed

+9
-4
lines changed

src/main/java/com/hankcs/hanlp/corpus/document/sentence/Sentence.java

+1 -1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ public String toString()
4949

5050
public static Sentence create(String param)
5151
{
52-
Pattern pattern = Pattern.compile("(\\[(([^\\s]+/[0-9a-zA-Z]+)\\s+)+?([^\\s]+/[0-9a-zA-Z]+)]/[0-9a-zA-Z]+)|([^\\s]+/[0-9a-zA-Z]+)");
52+
Pattern pattern = Pattern.compile("(\\[(([^\\s]+/[0-9a-zA-Z]+)\\s+)+?([^\\s]+/[0-9a-zA-Z]+)]/?[0-9a-zA-Z]+)|([^\\s]+/[0-9a-zA-Z]+)");
5353
Matcher matcher = pattern.matcher(param);
5454
List<IWord> wordList = new LinkedList<IWord>();
5555
while (matcher.find())

src/main/java/com/hankcs/hanlp/corpus/document/sentence/word/CompoundWord.java

+7 -3
Original file line numberDiff line numberDiff line change
@@ -96,11 +96,11 @@ public CompoundWord(List<Word> innerList, String label)
9696
public static CompoundWord create(String param)
9797
{
9898
if (param == null) return null;
99-
int cutIndex = param.lastIndexOf('/');
99+
int cutIndex = param.lastIndexOf(']');
100100
if (cutIndex <= 2 || cutIndex == param.length() - 1) return null;
101-
String wordParam = param.substring(1, cutIndex - 1);
101+
String wordParam = param.substring(1, cutIndex);
102102
List<Word> wordList = new LinkedList<Word>();
103-
for (String single : wordParam.split(" "))
103+
for (String single : wordParam.split("\\s+"))
104104
{
105105
if (single.length() == 0) continue;
106106
Word word = Word.create(single);
@@ -112,6 +112,10 @@ public static CompoundWord create(String param)
112112
wordList.add(word);
113113
}
114114
String labelParam = param.substring(cutIndex + 1);
115+
if (labelParam.startsWith("/"))
116+
{
117+
labelParam = labelParam.substring(1);
118+
}
115119
return new CompoundWord(wordList, labelParam);
116120
}
117121
}

src/test/java/com/hankcs/test/corpus/TestWord.java

+1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ public void testCreate() throws Exception
2424
{
2525
assertEquals("人民网/nz", Word.create("人民网/nz").toString());
2626
assertEquals("[纽约/nsf 时报/n]/nz", CompoundWord.create("[纽约/nsf 时报/n]/nz").toString());
27+
assertEquals("[中央/n 人民/n 广播/vn 电台/n]/nt", CompoundWord.create("[中央/n 人民/n 广播/vn 电台/n]nt").toString());
2728
}
2829

2930
public void testSpace() throws Exception

0 commit comments

Comments
 (0)