11
11
*/
12
12
package com .hankcs .hanlp .dictionary .ts ;
13
13
14
+ import com .hankcs .hanlp .collection .AhoCorasick .AhoCorasickDoubleArrayTrie ;
14
15
import com .hankcs .hanlp .collection .trie .DoubleArrayTrie ;
15
16
import com .hankcs .hanlp .corpus .dictionary .StringDictionary ;
16
17
import com .hankcs .hanlp .corpus .io .ByteArray ;
17
18
import com .hankcs .hanlp .dictionary .BaseSearcher ;
19
+ import com .hankcs .hanlp .dictionary .py .Pinyin ;
18
20
import com .hankcs .hanlp .utility .Predefine ;
19
21
20
22
import java .io .DataOutputStream ;
21
23
import java .io .FileOutputStream ;
22
- import java .util .LinkedList ;
23
- import java .util .Map ;
24
- import java .util .Set ;
24
+ import java .util .*;
25
25
26
26
import static com .hankcs .hanlp .utility .Predefine .logger ;
27
27
@@ -36,7 +36,7 @@ public class BaseChineseDictionary
36
36
* @param trie
37
37
* @return
38
38
*/
39
// Two-argument convenience overload: forwards to load(path, trie, reverse) with
// reverse fixed to false and returns that call's result unchanged.
// In this diff the '-' line is the previous signature (DoubleArrayTrie) and the
// '+' line is its replacement (AhoCorasickDoubleArrayTrie); the body is unchanged.
- static boolean load (String path , DoubleArrayTrie <String > trie )
39
+ static boolean load (String path , AhoCorasickDoubleArrayTrie <String > trie )
40
40
{
41
41
return load (path , trie , false );
42
42
}
@@ -48,7 +48,7 @@ static boolean load(String path, DoubleArrayTrie<String> trie)
48
48
* @param reverse 是否将其翻转
49
49
* @return
50
50
*/
51
- static boolean load (String path , DoubleArrayTrie <String > trie , boolean reverse )
51
+ static boolean load (String path , AhoCorasickDoubleArrayTrie <String > trie , boolean reverse )
52
52
{
53
53
String datPath = path ;
54
54
if (reverse )
@@ -61,37 +61,37 @@ static boolean load(String path, DoubleArrayTrie<String> trie, boolean reverse)
61
61
if (!dictionary .load (path )) return false ;
62
62
if (reverse ) dictionary = dictionary .reverse ();
63
63
Set <Map .Entry <String , String >> entrySet = dictionary .entrySet ();
64
- int resultCode = trie . build ( entrySet );
65
- if ( resultCode < 0 )
64
+ TreeMap < String , String > map = new TreeMap <>( );
65
+ for ( Map . Entry < String , String > entry : entrySet )
66
66
{
67
- logger .warning (path + "构建DAT失败,错误码:" + resultCode );
68
- return false ;
67
+ map .put (entry .getKey (), entry .getValue ());
69
68
}
69
+ logger .info ("正在构建AhoCorasickDoubleArrayTrie,来源:" + path );
70
+ trie .build (map );
70
71
logger .info ("正在缓存双数组" + datPath );
71
72
saveDat (datPath , trie , entrySet );
72
73
return true ;
73
74
}
74
75
75
/**
 * Restores a trie from its binary cache file.
 *
 * The cache is opened through ByteArray.createByteArray; a null result makes the
 * method return false. An int count is read first, followed by that many strings
 * which become the value array. In the new ('+') version the same ByteArray is then
 * handed to trie.load together with the values, so the trie data is read from the
 * same file (path + Predefine.BIN_EXT) rather than from a separate
 * path + Predefine.TRIE_EXT file as in the removed ('-') version.
 *
 * @param path cache path without extension; the extension constant is appended here
 * @param trie trie instance to populate
 * @return false if the cache could not be opened, true otherwise
 */
- static boolean loadDat (String path , DoubleArrayTrie <String > trie )
76
+ static boolean loadDat (String path , AhoCorasickDoubleArrayTrie <String > trie )
76
77
{
77
- ByteArray byteArray = ByteArray .createByteArray (path + Predefine .VALUE_EXT );
78
+ ByteArray byteArray = ByteArray .createByteArray (path + Predefine .BIN_EXT );
78
79
// No readable cache: report failure to the caller.
if (byteArray == null ) return false ;
79
80
// The file starts with the number of values.
int size = byteArray .nextInt ();
80
81
String [] valueArray = new String [size ];
81
82
for (int i = 0 ; i < valueArray .length ; ++i )
82
83
{
83
84
valueArray [i ] = byteArray .nextString ();
84
85
}
85
// NOTE(review): the removed line returned false when trie.load failed; the added
// line discards whatever trie.load(byteArray, valueArray) returns, so this method
// now reports true once the cache file has been opened. If that overload returns a
// status, it should probably be propagated - confirm against its declaration.
- if (! trie .load (path + Predefine . TRIE_EXT , valueArray )) return false ;
86
+ trie .load (byteArray , valueArray );
86
87
return true ;
87
88
}
88
89
89
- static boolean saveDat (String path , DoubleArrayTrie <String > trie , Set <Map .Entry <String , String >> entrySet )
90
+ static boolean saveDat (String path , AhoCorasickDoubleArrayTrie <String > trie , Set <Map .Entry <String , String >> entrySet )
90
91
{
91
- if (!trie .save (path + Predefine .TRIE_EXT )) return false ;
92
92
try
93
93
{
94
- DataOutputStream out = new DataOutputStream (new FileOutputStream (path + Predefine .VALUE_EXT ));
94
+ DataOutputStream out = new DataOutputStream (new FileOutputStream (path + Predefine .BIN_EXT ));
95
95
out .writeInt (entrySet .size ());
96
96
for (Map .Entry <String , String > entry : entrySet )
97
97
{
@@ -102,6 +102,7 @@ static boolean saveDat(String path, DoubleArrayTrie<String> trie, Set<Map.Entry<
102
102
out .writeChar (c );
103
103
}
104
104
}
105
+ trie .save (out );
105
106
out .close ();
106
107
}
107
108
catch (Exception e )
@@ -146,6 +147,38 @@ protected static String segLongest(char[] charArray, DoubleArrayTrie<String> tri
146
147
return sb .toString ();
147
148
}
148
149
150
/**
 * Rewrites a character sequence by substituting trie matches, preferring the
 * longest match at each starting position.
 *
 * First pass: trie.parseText reports hits through the callback; for every start
 * index only the value of the longest hit seen so far is kept. Second pass: the
 * text is walked left to right; positions without a recorded hit are copied
 * verbatim, positions with a hit emit the hit's value and skip the matched span.
 * Hits that begin inside a span already consumed by an earlier hit are therefore
 * never emitted.
 *
 * @param charArray the text to convert
 * @param trie      automaton mapping matched substrings to replacement strings
 * @return the converted text
 */
+ protected static String segLongest (char [] charArray , AhoCorasickDoubleArrayTrie <String > trie )
151
+ {
152
// wordNet[i]: replacement value of the longest hit starting at i (null if none).
+ final String [] wordNet = new String [charArray .length ];
153
// lengthNet[i]: length of that hit; stays 0 where there is no hit.
+ final int [] lengthNet = new int [charArray .length ];
154
+ trie .parseText (charArray , new AhoCorasickDoubleArrayTrie .IHit <String >()
155
+ {
156
+ @ Override
157
+ public void hit (int begin , int end , String value )
158
+ {
159
// presumably begin is inclusive and end exclusive, so this is the match length
// in chars - confirm against the IHit contract.
+ int length = end - begin ;
160
// Keep only a strictly longer hit for this start index.
+ if (length > lengthNet [begin ])
161
+ {
162
+ wordNet [begin ] = value ;
163
+ lengthNet [begin ] = length ;
164
+ }
165
+ }
166
+ });
167
+ StringBuilder sb = new StringBuilder (charArray .length );
168
// offset is advanced inside the loop body: by one for an unmatched char,
// by the hit length for a matched span.
+ for (int offset = 0 ; offset < wordNet .length ; )
169
+ {
170
+ if (wordNet [offset ] == null )
171
+ {
172
// No hit starts here: copy the original character through.
+ sb .append (charArray [offset ]);
173
+ ++offset ;
174
+ continue ;
175
+ }
176
+ sb .append (wordNet [offset ]);
177
// A recorded hit always has length > 0 (the callback only stores when
// length > lengthNet[begin], which starts at 0), so the loop always advances.
+ offset += lengthNet [offset ];
178
+ }
179
+ return sb .toString ();
180
+ }
181
+
149
182
/**
150
183
* 最长分词
151
184
*/
0 commit comments