|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +from typing import Any |
| 4 | + |
| 5 | +from yuxi.knowledge.chunking.ragflow_like import nlp |
| 6 | +from yuxi.knowledge.chunking.ragflow_like.parsers.general import _iter_sections, _unescape_delimiter |
| 7 | + |
| 8 | + |
def _slice_text_by_tokens(text: str, max_tokens: int, overlap_tokens: int) -> list[str]:
    """Split *text* into chunks of at most ``max_tokens`` tokens each.

    Chunks are grown character by character, using ``nlp.count_tokens`` on the
    accumulated string as the budget measure (token counts are not additive
    across concatenation, so each candidate is re-counted).  When
    ``overlap_tokens`` > 0, consecutive chunks share a trailing/leading
    overlap of at most that many tokens.

    Returns a list of stripped, non-empty chunk strings.
    """
    if max_tokens <= 0:
        # No budget configured: emit the whole text as one chunk (if non-blank).
        return [text] if text.strip() else []

    units = list(text)  # character-level units
    chunks: list[str] = []
    start = 0

    while start < len(units):
        current = ""
        current_tokens = 0
        end = start

        # Grow the chunk until adding one more unit would exceed the budget.
        # The first unit is always taken (``current`` is empty), so a single
        # over-budget character still makes progress.
        while end < len(units):
            next_text = current + units[end]
            next_tokens = nlp.count_tokens(next_text)
            if current and next_tokens > max_tokens:
                break
            current = next_text
            current_tokens = next_tokens
            end += 1
            if current_tokens >= max_tokens:
                break

        chunk = current.strip()
        if chunk:
            chunks.append(chunk)

        if end >= len(units):
            break

        if overlap_tokens <= 0:
            start = end
            continue

        # Walk backwards from the chunk end, keeping as much trailing text as
        # fits in the overlap budget; the next chunk restarts from there.
        backtrack = end
        overlap_text = ""
        while backtrack > start:
            candidate = units[backtrack - 1] + overlap_text
            if nlp.count_tokens(candidate) > overlap_tokens:
                break
            overlap_text = candidate
            backtrack -= 1

        # BUG FIX: if the entire chunk fits within ``overlap_tokens`` (possible
        # when the growth loop stopped below ``max_tokens``), ``backtrack``
        # lands on ``start`` and the original ``backtrack if backtrack < end
        # else end`` left ``start`` unchanged — an infinite loop.  Advancing by
        # at least one unit is identical in every non-degenerate case
        # (``end > start`` always holds here) and guarantees termination.
        start = max(backtrack, start + 1)

    return chunks
| 56 | + |
| 57 | + |
def _split_section_with_overlap(section: str, chunk_token_num: int, overlapped_percent: int) -> list[str]:
    """Slice *section* into token-bounded chunks, deriving the overlap budget.

    The overlap is ``overlapped_percent`` (clamped to 0–99) percent of
    ``chunk_token_num``; it is zero whenever either parameter is non-positive.
    """
    if chunk_token_num <= 0 or overlapped_percent <= 0:
        return _slice_text_by_tokens(section, chunk_token_num, 0)
    clamped_percent = max(0, min(overlapped_percent, 99))
    overlap_budget = int(chunk_token_num * clamped_percent / 100)
    return _slice_text_by_tokens(section, chunk_token_num, overlap_budget)
| 63 | + |
| 64 | + |
def chunk_markdown(markdown_content: str, parser_config: dict[str, Any] | None = None) -> list[str]:
    """Chunk markdown text into token-bounded sections.

    Reads ``delimiter`` (default ``"\\n"``), ``chunk_token_num`` (default 512)
    and ``overlapped_percent`` (default 0) from *parser_config*.  Sections are
    produced by ``_iter_sections``; any section whose token count exceeds the
    budget is re-split (with overlap) via ``_split_section_with_overlap``.

    Returns the list of stripped, non-empty chunk strings.
    """
    cfg = parser_config or {}

    delim = _unescape_delimiter(str(cfg.get("delimiter", "\n") or "\n"))
    max_tokens = int(cfg.get("chunk_token_num", 512) or 512)
    overlap_pct = int(cfg.get("overlapped_percent", 0) or 0)

    out: list[str] = []
    for raw_section, _meta in _iter_sections(markdown_content, delim):
        stripped = (raw_section or "").strip()
        if not stripped:
            continue
        if max_tokens > 0 and nlp.count_tokens(stripped) > max_tokens:
            out.extend(_split_section_with_overlap(stripped, max_tokens, overlap_pct))
        else:
            out.append(stripped)

    return out
0 commit comments