Skip to content

Commit 070e178

Browse files
committed
简繁转换模式匹配升级到AhoCorasickDoubleArrayTrie,性能大幅提升
1 parent 9c4f0fd commit 070e178

File tree

6 files changed

+56
-21
lines changed

6 files changed

+56
-21
lines changed

LICENSE

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ Apache License
1313
the copyright owner that is granting the License.
1414

1515
"Legal Entity" shall mean the union of the acting entity and all
16-
other entities that control, are controlled by, or are under common
16+
other entities that control, are controlled by, or are under common
1717
control with that entity. For the purposes of this definition,
1818
"control" means (i) the power, direct or indirect, to cause the
1919
direction or management of such entity, whether by contract or

src/main/java/com/hankcs/hanlp/algoritm/LongestCommonSubstring.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ public static int compute(char[] str1, char[] str2)
2727
// the start position of substring in original string
2828
// int start1 = -1;
2929
// int start2 = -1;
30-
// the longest length of common substring
30+
// the longest length of common substring
3131
int longest = 0;
3232

3333
// record how many comparisons the solution did;
@@ -62,7 +62,7 @@ public static int compute(char[] str1, char[] str2)
6262
}
6363
}
6464

65-
// shift string2 to find the longest common substring
65+
// shift string2 to find the longest common substring
6666
for (int j = 1; j < size2; ++j)
6767
{
6868
int m = 0;

src/main/java/com/hankcs/hanlp/dictionary/ts/BaseChineseDictionary.java

+48-15
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,17 @@
1111
*/
1212
package com.hankcs.hanlp.dictionary.ts;
1313

14+
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
1415
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
1516
import com.hankcs.hanlp.corpus.dictionary.StringDictionary;
1617
import com.hankcs.hanlp.corpus.io.ByteArray;
1718
import com.hankcs.hanlp.dictionary.BaseSearcher;
19+
import com.hankcs.hanlp.dictionary.py.Pinyin;
1820
import com.hankcs.hanlp.utility.Predefine;
1921

2022
import java.io.DataOutputStream;
2123
import java.io.FileOutputStream;
22-
import java.util.LinkedList;
23-
import java.util.Map;
24-
import java.util.Set;
24+
import java.util.*;
2525

2626
import static com.hankcs.hanlp.utility.Predefine.logger;
2727

@@ -36,7 +36,7 @@ public class BaseChineseDictionary
3636
* @param trie
3737
* @return
3838
*/
39-
static boolean load(String path, DoubleArrayTrie<String> trie)
39+
static boolean load(String path, AhoCorasickDoubleArrayTrie<String> trie)
4040
{
4141
return load(path, trie, false);
4242
}
@@ -48,7 +48,7 @@ static boolean load(String path, DoubleArrayTrie<String> trie)
4848
* @param reverse 是否将其翻转
4949
* @return
5050
*/
51-
static boolean load(String path, DoubleArrayTrie<String> trie, boolean reverse)
51+
static boolean load(String path, AhoCorasickDoubleArrayTrie<String> trie, boolean reverse)
5252
{
5353
String datPath = path;
5454
if (reverse)
@@ -61,37 +61,37 @@ static boolean load(String path, DoubleArrayTrie<String> trie, boolean reverse)
6161
if (!dictionary.load(path)) return false;
6262
if (reverse) dictionary = dictionary.reverse();
6363
Set<Map.Entry<String, String>> entrySet = dictionary.entrySet();
64-
int resultCode = trie.build(entrySet);
65-
if (resultCode < 0)
64+
TreeMap<String, String> map = new TreeMap<>();
65+
for (Map.Entry<String, String> entry : entrySet)
6666
{
67-
logger.warning(path + "构建DAT失败,错误码:" + resultCode);
68-
return false;
67+
map.put(entry.getKey(), entry.getValue());
6968
}
69+
logger.info("正在构建AhoCorasickDoubleArrayTrie,来源:" + path);
70+
trie.build(map);
7071
logger.info("正在缓存双数组" + datPath);
7172
saveDat(datPath, trie, entrySet);
7273
return true;
7374
}
7475

75-
static boolean loadDat(String path, DoubleArrayTrie<String> trie)
76+
static boolean loadDat(String path, AhoCorasickDoubleArrayTrie<String> trie)
7677
{
77-
ByteArray byteArray = ByteArray.createByteArray(path + Predefine.VALUE_EXT);
78+
ByteArray byteArray = ByteArray.createByteArray(path + Predefine.BIN_EXT);
7879
if (byteArray == null) return false;
7980
int size = byteArray.nextInt();
8081
String[] valueArray = new String[size];
8182
for (int i = 0; i < valueArray.length; ++i)
8283
{
8384
valueArray[i] = byteArray.nextString();
8485
}
85-
if (!trie.load(path + Predefine.TRIE_EXT, valueArray)) return false;
86+
trie.load(byteArray, valueArray);
8687
return true;
8788
}
8889

89-
static boolean saveDat(String path, DoubleArrayTrie<String> trie, Set<Map.Entry<String, String>> entrySet)
90+
static boolean saveDat(String path, AhoCorasickDoubleArrayTrie<String> trie, Set<Map.Entry<String, String>> entrySet)
9091
{
91-
if (!trie.save(path + Predefine.TRIE_EXT)) return false;
9292
try
9393
{
94-
DataOutputStream out = new DataOutputStream(new FileOutputStream(path + Predefine.VALUE_EXT));
94+
DataOutputStream out = new DataOutputStream(new FileOutputStream(path + Predefine.BIN_EXT));
9595
out.writeInt(entrySet.size());
9696
for (Map.Entry<String, String> entry : entrySet)
9797
{
@@ -102,6 +102,7 @@ static boolean saveDat(String path, DoubleArrayTrie<String> trie, Set<Map.Entry<
102102
out.writeChar(c);
103103
}
104104
}
105+
trie.save(out);
105106
out.close();
106107
}
107108
catch (Exception e)
@@ -146,6 +147,38 @@ protected static String segLongest(char[] charArray, DoubleArrayTrie<String> tri
146147
return sb.toString();
147148
}
148149

150+
/**
 * Rewrites {@code charArray} by substituting dictionary matches found by {@code trie}.
 * For every start position, the longest match reported for that position is kept;
 * the text is then scanned left to right, emitting the stored value for a matched
 * position (and skipping the matched span) or the original character otherwise.
 *
 * @param charArray the text to convert
 * @param trie      the automaton whose {@code parseText} reports matches through the
 *                  {@code IHit} callback
 * @return the converted text
 */
protected static String segLongest(char[] charArray, AhoCorasickDoubleArrayTrie<String> trie)
151+
{
152+
// wordNet[i] holds the replacement value of the longest match starting at index i (null if none).
final String[] wordNet = new String[charArray.length];
153+
// lengthNet[i] holds the length of that match; 0 means no match has been recorded at i.
final int[] lengthNet = new int[charArray.length];
154+
trie.parseText(charArray, new AhoCorasickDoubleArrayTrie.IHit<String>()
155+
{
156+
@Override
157+
public void hit(int begin, int end, String value)
158+
{
159+
// The code treats end as exclusive: the match length is end - begin.
int length = end - begin;
160+
// Keep only the longest match per start position; ties keep the first one reported.
if (length > lengthNet[begin])
161+
{
162+
wordNet[begin] = value;
163+
lengthNet[begin] = length;
164+
}
165+
}
166+
});
167+
StringBuilder sb = new StringBuilder(charArray.length);
168+
// The loop header has no increment: each branch advances offset itself.
for (int offset = 0; offset < wordNet.length; )
169+
{
170+
if (wordNet[offset] == null)
171+
{
172+
// No match starts here: copy the original character through unchanged.
sb.append(charArray[offset]);
173+
++offset;
174+
continue;
175+
}
176+
sb.append(wordNet[offset]);
177+
// Jump past the matched span; matches that start inside it are never visited.
offset += lengthNet[offset];
178+
}
179+
return sb.toString();
180+
}
181+
149182
/**
150183
* 最长分词
151184
*/

src/main/java/com/hankcs/hanlp/dictionary/ts/SimplifiedChineseDictionary.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
package com.hankcs.hanlp.dictionary.ts;
1313

1414
import com.hankcs.hanlp.HanLP;
15+
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
1516
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
1617
import com.hankcs.hanlp.utility.Predefine;
1718

@@ -26,7 +27,7 @@ public class SimplifiedChineseDictionary extends BaseChineseDictionary
2627
/**
2728
* 简体=繁体
2829
*/
29-
static DoubleArrayTrie<String> trie = new DoubleArrayTrie<>();
30+
static AhoCorasickDoubleArrayTrie<String> trie = new AhoCorasickDoubleArrayTrie<>();
3031

3132
static
3233
{

src/main/java/com/hankcs/hanlp/dictionary/ts/TraditionalChineseDictionary.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
package com.hankcs.hanlp.dictionary.ts;
1313

1414
import com.hankcs.hanlp.HanLP;
15+
import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
1516
import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
1617

1718
import static com.hankcs.hanlp.utility.Predefine.logger;
@@ -25,7 +26,7 @@ public class TraditionalChineseDictionary extends BaseChineseDictionary
2526
/**
2627
* 繁体=简体
2728
*/
28-
static DoubleArrayTrie<String> trie = new DoubleArrayTrie<>();
29+
static AhoCorasickDoubleArrayTrie<String> trie = new AhoCorasickDoubleArrayTrie<>();
2930

3031
static
3132
{

src/test/java/com/hankcs/demo/DemoTraditionalChinese2SimplifiedChinese.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ public class DemoTraditionalChinese2SimplifiedChinese
2121
{
2222
public static void main(String[] args)
2323
{
24-
System.out.println(HanLP.convertToTraditionalChinese("用笔记本电脑写程序"));
24+
System.out.println(HanLP.convertToTraditionalChinese("用笔记本电脑写程序HelloWorld"));
2525
System.out.println(HanLP.convertToSimplifiedChinese("「以後等妳當上皇后,就能買士多啤梨慶祝了」"));
2626
}
2727
}

0 commit comments

Comments
 (0)