Skip to content

Commit 00d8458

Browse files
committed
解决上次优化索引分词带来的问题:hankcs#513 (Fix the issue introduced by the previous index-tokenization optimization: hankcs#513)
1 parent d37f97c commit 00d8458

File tree

3 files changed

+34
-5
lines changed

3 files changed

+34
-5
lines changed

src/main/java/com/hankcs/hanlp/seg/Segment.java

+22-4
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@ protected static List<Vertex> combineByCustomDictionary(List<Vertex> vertexList,
274274
vertexList.toArray(wordNet);
275275
// DAT合并
276276
DoubleArrayTrie<CoreDictionary.Attribute> dat = CustomDictionary.dat;
277-
for (int i = 0; i < wordNet.length; ++i)
277+
for (int i = 0, line = 0; i < wordNet.length; ++i)
278278
{
279279
int state = 1;
280280
state = dat.transition(wordNet[i].realWord, state);
@@ -297,15 +297,24 @@ protected static List<Vertex> combineByCustomDictionary(List<Vertex> vertexList,
297297
if (value != null)
298298
{
299299
combineWords(wordNet, i, end, value);
300-
wordNetAll.add(i, wordNet[i]);
300+
wordNetAll.add(line, wordNet[i]);
301+
line += wordNet[i].realWord.length();
301302
i = end - 1;
302303
}
304+
else
305+
{
306+
line += wordNet[i].realWord.length();
307+
}
308+
}
309+
else
310+
{
311+
line += wordNet[i].realWord.length();
303312
}
304313
}
305314
// BinTrie合并
306315
if (CustomDictionary.trie != null)
307316
{
308-
for (int i = 0; i < wordNet.length; ++i)
317+
for (int i = 0, line = 0; i < wordNet.length; ++i)
309318
{
310319
if (wordNet[i] == null) continue;
311320
BaseNode<CoreDictionary.Attribute> state = CustomDictionary.trie.transition(wordNet[i].realWord.toCharArray(), 0);
@@ -328,9 +337,18 @@ protected static List<Vertex> combineByCustomDictionary(List<Vertex> vertexList,
328337
if (value != null)
329338
{
330339
combineWords(wordNet, i, end, value);
331-
wordNetAll.add(i, wordNet[i]);
340+
wordNetAll.add(line, wordNet[i]);
341+
line += wordNet[i].realWord.length();
332342
i = end - 1;
333343
}
344+
else
345+
{
346+
line += wordNet[i].realWord.length();
347+
}
348+
}
349+
else
350+
{
351+
line += wordNet[i].realWord.length();
334352
}
335353
}
336354
}

src/main/java/com/hankcs/hanlp/seg/WordBasedGenerativeModelSegment.java

+3-1
Original file line numberDiff line numberDiff line change
@@ -494,7 +494,9 @@ protected static List<Term> decorateResultForIndexMode(List<Vertex> vertexList,
494494
if (
495495
((termMain.nature == Nature.mq && smallVertex.hasNature(Nature.q)) ||
496496
smallVertex.realWord.length() > 1)
497-
&& smallVertex != vertex)
497+
&& smallVertex != vertex // 防止重复添加
498+
&& currentLine + smallVertex.realWord.length() <= line + vertex.realWord.length() // 防止超出边界
499+
)
498500
{
499501
listIterator.add(smallVertex);
500502
Term termSub = convert(smallVertex);

src/test/java/com/hankcs/test/seg/TestSegment.java

+9
Original file line numberDiff line numberDiff line change
@@ -406,4 +406,13 @@ public void testIssue496() throws Exception
406406
System.out.println(segment.seg("中医药"));
407407
System.out.println(segment.seg("中医药大学"));
408408
}
409+
410+
public void testIssue513() throws Exception
411+
{
412+
List<Term> termList = IndexTokenizer.segment("南京市长江大桥");
413+
for (Term term : termList)
414+
{
415+
System.out.println(term + " [" + term.offset + ":" + (term.offset + term.word.length()) + "]");
416+
}
417+
}
409418
}

0 commit comments

Comments (0)