Skip to content

Commit bba9bc9

Browse files
committed
feat: 添加 separator 解析器及相关配置,支持严格分隔功能 Question: 知识库对excel切分无法按行切分
Fixes #664
1 parent f4c5649 commit bba9bc9

5 files changed

Lines changed: 115 additions & 3 deletions

File tree

backend/package/yuxi/knowledge/chunking/ragflow_like/dispatcher.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from typing import Any
44

5-
from yuxi.knowledge.chunking.ragflow_like.parsers import book, general, laws, qa, semantic
5+
from yuxi.knowledge.chunking.ragflow_like.parsers import book, general, laws, qa, semantic, separator
66
from yuxi.knowledge.chunking.ragflow_like.presets import map_to_internal_parser_id, normalize_chunk_preset_id
77

88

@@ -44,6 +44,8 @@ def _dispatch_markdown_parser(
4444
return laws.chunk_markdown(filename, markdown_content, parser_config)
4545
if parser_id == "semantic":
4646
return semantic.chunk_markdown(markdown_content, parser_config)
47+
if parser_id == "separator":
48+
return separator.chunk_markdown(markdown_content, parser_config)
4749

4850
return general.chunk_markdown(markdown_content, parser_config)
4951

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
from yuxi.knowledge.chunking.ragflow_like.parsers import book, general, laws, qa
1+
from yuxi.knowledge.chunking.ragflow_like.parsers import book, general, laws, qa, separator
22

3-
__all__ = ["general", "qa", "book", "laws"]
3+
__all__ = ["general", "qa", "book", "laws", "separator"]
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
from __future__ import annotations
2+
3+
from typing import Any
4+
5+
from yuxi.knowledge.chunking.ragflow_like import nlp
6+
from yuxi.knowledge.chunking.ragflow_like.parsers.general import _iter_sections, _unescape_delimiter
7+
8+
9+
def _slice_text_by_tokens(text: str, max_tokens: int, overlap_tokens: int) -> list[str]:
10+
if max_tokens <= 0:
11+
return [text] if text.strip() else []
12+
13+
units = [part for part in text]
14+
chunks: list[str] = []
15+
start = 0
16+
17+
while start < len(units):
18+
current = ""
19+
current_tokens = 0
20+
end = start
21+
22+
while end < len(units):
23+
next_text = current + units[end]
24+
next_tokens = nlp.count_tokens(next_text)
25+
if current and next_tokens > max_tokens:
26+
break
27+
current = next_text
28+
current_tokens = next_tokens
29+
end += 1
30+
if current_tokens >= max_tokens:
31+
break
32+
33+
chunk = current.strip()
34+
if chunk:
35+
chunks.append(chunk)
36+
37+
if end >= len(units):
38+
break
39+
40+
if overlap_tokens <= 0:
41+
start = end
42+
continue
43+
44+
backtrack = end
45+
overlap_text = ""
46+
while backtrack > start:
47+
candidate = units[backtrack - 1] + overlap_text
48+
if nlp.count_tokens(candidate) > overlap_tokens:
49+
break
50+
overlap_text = candidate
51+
backtrack -= 1
52+
53+
start = backtrack if backtrack < end else end
54+
55+
return chunks
56+
57+
58+
def _split_section_with_overlap(section: str, chunk_token_num: int, overlapped_percent: int) -> list[str]:
    """Slice one over-long section, converting the overlap percentage to tokens.

    Args:
        section: Text of the section to slice.
        chunk_token_num: Token budget per chunk.
        overlapped_percent: Requested overlap as a percentage of
            ``chunk_token_num``; values above 99 are treated as 99.

    Returns:
        The chunks produced by ``_slice_text_by_tokens`` for this section.
    """
    if not (chunk_token_num > 0 and overlapped_percent > 0):
        # No usable budget or no overlap requested: slice without overlap.
        return _slice_text_by_tokens(section, chunk_token_num, 0)

    # Cap at 99% so the overlap stays below the chunk budget.
    capped_percent = max(0, min(overlapped_percent, 99))
    overlap_budget = int(chunk_token_num * capped_percent / 100)
    return _slice_text_by_tokens(section, chunk_token_num, overlap_budget)
63+
64+
65+
def chunk_markdown(markdown_content: str, parser_config: dict[str, Any] | None = None) -> list[str]:
    """Split markdown into chunks, one per delimiter-separated section.

    Each non-blank section yielded by ``_iter_sections`` becomes its own chunk.
    A section whose token count exceeds ``chunk_token_num`` is further sliced
    by ``_split_section_with_overlap``.

    Args:
        markdown_content: The markdown text to chunk.
        parser_config: Optional settings. Keys read here are ``delimiter``
            (default ``"\\n"``), ``chunk_token_num`` (default 512) and
            ``overlapped_percent`` (default 0). Falsy values fall back to the
            defaults.

    Returns:
        The stripped, non-empty chunks in document order.
    """
    config = parser_config or {}

    raw_delimiter = config.get("delimiter", "\n") or "\n"
    delimiter = _unescape_delimiter(str(raw_delimiter))
    chunk_token_num = int(config.get("chunk_token_num", 512) or 512)
    overlapped_percent = int(config.get("overlapped_percent", 0) or 0)

    result: list[str] = []
    # Only the first element of each yielded pair is used here.
    for raw_section, _ in _iter_sections(markdown_content, delimiter):
        piece = (raw_section or "").strip()
        if not piece:
            continue

        # The tokenizer is consulted only when a positive budget is configured.
        oversized = chunk_token_num > 0 and nlp.count_tokens(piece) > chunk_token_num
        if oversized:
            result.extend(_split_section_with_overlap(piece, chunk_token_num, overlapped_percent))
        else:
            result.append(piece)

    return result

backend/package/yuxi/knowledge/chunking/ragflow_like/presets.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,15 @@
1010
CHUNK_PRESET_BOOK = "book"
1111
CHUNK_PRESET_LAWS = "laws"
1212
CHUNK_PRESET_SEMANTIC = "semantic"
13+
CHUNK_PRESET_SEPARATOR = "separator"
1314

1415
CHUNK_PRESET_IDS = {
1516
CHUNK_PRESET_GENERAL,
1617
CHUNK_PRESET_QA,
1718
CHUNK_PRESET_BOOK,
1819
CHUNK_PRESET_LAWS,
1920
CHUNK_PRESET_SEMANTIC,
21+
CHUNK_PRESET_SEPARATOR,
2022
}
2123

2224
CHUNK_PRESET_DESCRIPTIONS: dict[str, str] = {
@@ -25,6 +27,7 @@
2527
CHUNK_PRESET_BOOK: "书籍分块:强化章节标题识别并做层级合并,适合教材、手册、长章节文档。",
2628
CHUNK_PRESET_LAWS: "法规分块:按法条层级组织与合并,适合法律法规、制度规范类文本。",
2729
CHUNK_PRESET_SEMANTIC: "语义分块:利用嵌入和聚类算法进行语义切分,并自动增强标题上下文。",
30+
CHUNK_PRESET_SEPARATOR: "严格分隔:命中分隔符即切分,仅超长片段内部继续按长度切分。",
2831
}
2932

3033
CHUNK_ENGINE_VERSION = "ragflow_like_v1"
@@ -74,6 +77,17 @@
7477
"method": "light",
7578
},
7679
},
80+
CHUNK_PRESET_SEPARATOR: {
81+
"layout_recognize": "DeepDOC",
82+
"chunk_token_num": 512,
83+
"delimiter": "\n",
84+
"auto_keywords": 0,
85+
"auto_questions": 0,
86+
"html4excel": False,
87+
"topn_tags": 3,
88+
"raptor": {"use_raptor": False},
89+
"graphrag": {"use_graphrag": False},
90+
},
7791
}
7892

7993

@@ -251,4 +265,9 @@ def get_chunk_preset_options() -> list[dict[str, str]]:
251265
"label": "Semantic",
252266
"description": CHUNK_PRESET_DESCRIPTIONS[CHUNK_PRESET_SEMANTIC],
253267
},
268+
{
269+
"value": CHUNK_PRESET_SEPARATOR,
270+
"label": "Separator",
271+
"description": CHUNK_PRESET_DESCRIPTIONS[CHUNK_PRESET_SEPARATOR],
272+
},
254273
]

web/src/utils/chunk_presets.js

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ export const CHUNK_PRESET_OPTIONS = [
2323
value: 'semantic',
2424
label: 'Semantic',
2525
description: '语义分块:利用嵌入和聚类算法进行语义切分,并自动增强标题上下文。'
26+
},
27+
{
28+
value: 'separator',
29+
label: 'Separator',
30+
description: '严格分隔:命中分隔符即切分,仅超长片段内部继续按长度切分。'
2631
}
2732
]
2833

0 commit comments

Comments
 (0)