Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions openviking/storage/queuefs/semantic_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,7 @@ async def _overview_task(self, dir_uri: str) -> None:
dir_uri, file_summaries, children_abstracts
)
abstract = self._processor._extract_abstract_from_overview(overview)
overview, abstract = self._processor._enforce_size_limits(overview, abstract)

try:
await self._viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=self._ctx)
Expand Down
191 changes: 185 additions & 6 deletions openviking/storage/queuefs/semantic_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@ async def _process_memory_directory(self, msg: SemanticMsg) -> None:

overview = await self._generate_overview(dir_uri, file_summaries, [])
abstract = self._extract_abstract_from_overview(overview)
overview, abstract = self._enforce_size_limits(overview, abstract)

try:
await viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=ctx)
Expand Down Expand Up @@ -577,8 +578,8 @@ async def _generate_text_summary(
logger.warning(f"Failed to decode file as UTF-8, skipping: {file_path}")
return {"name": file_name, "summary": ""}

# Limit content length (about 10000 tokens)
max_chars = 30000
# Limit content length
max_chars = get_openviking_config().semantic.max_file_content_chars
if len(content) > max_chars:
content = content[:max_chars] + "\n...(truncated)"

Expand Down Expand Up @@ -686,6 +687,15 @@ def _extract_abstract_from_overview(self, overview_content: str) -> str:

return "\n".join(content_lines).strip()

def _enforce_size_limits(self, overview: str, abstract: str) -> Tuple[str, str]:
    """Clamp overview and abstract to their configured maximum lengths.

    The overview is hard-truncated at ``overview_max_chars``; the abstract
    is truncated with a trailing "..." marker (total length stays exactly
    ``abstract_max_chars``) so readers can tell it was cut short.

    Args:
        overview: Full overview markdown text.
        abstract: Abstract text extracted from the overview.

    Returns:
        Tuple of (overview, abstract), each within its configured limit.
    """
    limits = get_openviking_config().semantic
    clamped_overview = (
        overview
        if len(overview) <= limits.overview_max_chars
        else overview[: limits.overview_max_chars]
    )
    clamped_abstract = abstract
    if len(clamped_abstract) > limits.abstract_max_chars:
        clamped_abstract = clamped_abstract[: limits.abstract_max_chars - 3] + "..."
    return clamped_overview, clamped_abstract

def _parse_overview_md(self, overview_content: str) -> Dict[str, str]:
"""Parse overview.md and extract file summaries.

Expand Down Expand Up @@ -747,6 +757,11 @@ async def _generate_overview(
) -> str:
"""Generate directory's .overview.md (L1).

For small directories, generates a single overview from all file summaries.
For large directories that would exceed the prompt budget, splits file
summaries into batches, generates a partial overview per batch, then
merges the partials into a final overview.

Args:
dir_uri: Directory URI
file_summaries: File summary list
Expand All @@ -755,9 +770,10 @@ async def _generate_overview(
Returns:
Overview content
"""
import re

vlm = get_openviking_config().vlm
config = get_openviking_config()
vlm = config.vlm
semantic = config.semantic

if not vlm.is_available():
logger.warning("VLM not available, using default overview")
Expand All @@ -778,7 +794,64 @@ async def _generate_overview(
else "None"
)

# Generate overview
# Budget guard: check if prompt would be oversized
estimated_size = len(file_summaries_str) + len(children_abstracts_str)
over_budget = estimated_size > semantic.max_overview_prompt_chars
many_files = len(file_summaries) > semantic.overview_batch_size

if over_budget and many_files:
# Many files, oversized prompt → batch and merge
logger.info(
f"Overview prompt for {dir_uri} exceeds budget "
f"({estimated_size} chars, {len(file_summaries)} files). "
f"Splitting into batches of {semantic.overview_batch_size}."
)
overview = await self._batched_generate_overview(
dir_uri, file_summaries, children_abstracts, file_index_map
)
elif over_budget:
# Few files but long summaries → truncate summaries to fit budget
logger.info(
f"Overview prompt for {dir_uri} exceeds budget "
f"({estimated_size} chars) with {len(file_summaries)} files. "
f"Truncating summaries to fit."
)
budget = semantic.max_overview_prompt_chars
budget -= len(children_abstracts_str)
per_file = max(100, budget // max(len(file_summaries), 1))
truncated_lines = []
for idx, item in enumerate(file_summaries, 1):
summary = item["summary"][:per_file]
truncated_lines.append(f"[{idx}] {item['name']}: {summary}")
file_summaries_str = "\n".join(truncated_lines)
overview = await self._single_generate_overview(
dir_uri,
file_summaries_str,
children_abstracts_str,
file_index_map,
)
else:
overview = await self._single_generate_overview(
dir_uri,
file_summaries_str,
children_abstracts_str,
file_index_map,
)

return overview

async def _single_generate_overview(
self,
dir_uri: str,
file_summaries_str: str,
children_abstracts_str: str,
file_index_map: Dict[int, str],
) -> str:
"""Generate overview from a single prompt (small directories)."""
import re

vlm = get_openviking_config().vlm

try:
prompt = render_prompt(
"semantic.overview_generation",
Expand All @@ -801,9 +874,115 @@ def replace_index(match):
return overview.strip()

except Exception as e:
logger.error(f"Failed to generate overview for {dir_uri}: {e}", exc_info=True)
logger.error(
f"Failed to generate overview for {dir_uri}: {e}",
exc_info=True,
)
return f"# {dir_uri.split('/')[-1]}\n\nDirectory overview"

async def _batched_generate_overview(
    self,
    dir_uri: str,
    file_summaries: List[Dict[str, str]],
    children_abstracts: List[Dict[str, str]],
    file_index_map: Dict[int, str],
) -> str:
    """Generate overview by batching file summaries and merging.

    Splits file summaries into batches, generates a partial overview per
    batch, then merges all partials into a final overview.

    Args:
        dir_uri: URI of the directory being summarized; its last path
            segment is used as the directory name in prompts and fallbacks.
        file_summaries: List of ``{"name", "summary"}`` dicts, one per file.
        children_abstracts: List of ``{"name", "abstract"}`` dicts for child
            directories; included in the first batch and in the merge prompt.
        file_index_map: Not read in this method — each batch builds its own
            index map with global offsets below. NOTE(review): presumably
            kept for signature parity with the single-prompt path; confirm
            before removing.

    Returns:
        Final merged overview markdown. Falls back to a minimal
        ``"# <dir>\\n\\nDirectory overview"`` stub when every batch fails,
        or to the first partial when the merge step fails.
    """
    import re

    vlm = get_openviking_config().vlm
    semantic = get_openviking_config().semantic
    batch_size = semantic.overview_batch_size
    dir_name = dir_uri.split("/")[-1]

    # Split file summaries into fixed-size batches
    batches = [
        file_summaries[i : i + batch_size] for i in range(0, len(file_summaries), batch_size)
    ]
    logger.info(f"Generating overview for {dir_uri} in {len(batches)} batches")

    # Build children abstracts string (used in first batch + merge)
    children_abstracts_str = (
        "\n".join(f"- {item['name']}/: {item['abstract']}" for item in children_abstracts)
        if children_abstracts
        else "None"
    )

    # Generate partial overview per batch using global file indices so that
    # [number] references remain unique across the whole directory, not
    # merely within one batch.
    partial_overviews = []
    global_offset = 0
    for batch_idx, batch in enumerate(batches):
        # Build per-batch index map using global offsets (1-based indices)
        batch_lines = []
        batch_index_map = {}
        for local_idx, item in enumerate(batch):
            global_idx = global_offset + local_idx + 1
            batch_index_map[global_idx] = item["name"]
            batch_lines.append(f"[{global_idx}] {item['name']}: {item['summary']}")
        batch_str = "\n".join(batch_lines)
        global_offset += len(batch)

        # Include children abstracts in the first batch only, so they are
        # not repeated (and re-billed) in every prompt.
        children_str = children_abstracts_str if batch_idx == 0 else "None"

        try:
            prompt = render_prompt(
                "semantic.overview_generation",
                {
                    "dir_name": dir_name,
                    "file_summaries": batch_str,
                    "children_abstracts": children_str,
                },
            )
            partial = await vlm.get_completion_async(prompt)

            # Replace [number] references per batch using batch-local map.
            # The factory function binds idx_map eagerly, avoiding the
            # late-binding-closure-in-loop pitfall.
            def make_replacer(idx_map):
                def replacer(match):
                    idx = int(match.group(1))
                    # Unknown indices are left as-is rather than dropped
                    return idx_map.get(idx, match.group(0))

                return replacer

            partial = re.sub(r"\[(\d+)\]", make_replacer(batch_index_map), partial)
            partial_overviews.append(partial.strip())
        except Exception as e:
            # A failed batch is skipped, not fatal — remaining batches can
            # still produce a usable (if incomplete) overview.
            logger.warning(
                f"Failed to generate partial overview batch "
                f"{batch_idx + 1}/{len(batches)} for {dir_uri}: {e}"
            )

    if not partial_overviews:
        # Every batch failed: return a minimal stub overview
        return f"# {dir_name}\n\nDirectory overview"

    # If only one batch succeeded, use it directly (no merge call needed)
    if len(partial_overviews) == 1:
        return partial_overviews[0]

    # Merge partials into a final overview (include children for context)
    combined = "\n\n---\n\n".join(partial_overviews)
    try:
        prompt = render_prompt(
            "semantic.overview_generation",
            {
                "dir_name": dir_name,
                "file_summaries": combined,
                "children_abstracts": children_abstracts_str,
            },
        )
        overview = await vlm.get_completion_async(prompt)
        return overview.strip()
    except Exception as e:
        logger.error(
            f"Failed to merge partial overviews for {dir_uri}: {e}",
            exc_info=True,
        )
        # Merge failed: degrade gracefully to the first partial overview
        return partial_overviews[0]

async def _vectorize_directory(
self,
uri: str,
Expand Down
6 changes: 6 additions & 0 deletions openviking_cli/utils/config/open_viking_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
ImageConfig,
MarkdownConfig,
PDFConfig,
SemanticConfig,
TextConfig,
VideoConfig,
)
Expand Down Expand Up @@ -94,6 +95,11 @@ class OpenVikingConfig(BaseModel):
default_factory=lambda: DirectoryConfig(), description="Directory parsing configuration"
)

semantic: SemanticConfig = Field(
default_factory=lambda: SemanticConfig(),
description="Semantic processing configuration (overview/abstract limits)",
)

auto_generate_l0: bool = Field(
default=True, description="Automatically generate L0 (abstract) if not provided"
)
Expand Down
26 changes: 26 additions & 0 deletions openviking_cli/utils/config/parser_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,32 @@ class DirectoryConfig(ParserConfig):
preserve_structure: bool = True


@dataclass
class SemanticConfig:
    """
    Configuration for semantic processing (overview/abstract generation).

    Controls prompt budget limits and output size constraints for the
    SemanticProcessor pipeline. Defaults mirror the constants that were
    previously hardcoded in the processor.
    """

    max_file_content_chars: int = 30000
    """Maximum characters of file content sent to LLM for summary generation."""

    max_overview_prompt_chars: int = 60000
    """Maximum characters allowed in the overview generation prompt.
    If exceeded, file summaries are batched and merged."""

    overview_batch_size: int = 50
    """Maximum number of file summaries per batch when splitting oversized prompts."""

    # NOTE(review): abstract truncation appends "..." after cutting at
    # abstract_max_chars - 3, so values <= 3 would misbehave — confirm a
    # sane lower bound is enforced by callers or config validation.
    abstract_max_chars: int = 256
    """Maximum characters for generated abstracts."""

    overview_max_chars: int = 4000
    """Maximum characters for generated overviews."""


# Configuration registry for dynamic loading
PARSER_CONFIG_REGISTRY = {
"pdf": PDFConfig,
Expand Down
82 changes: 82 additions & 0 deletions tests/misc/test_semantic_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
# SPDX-License-Identifier: Apache-2.0

"""Tests for SemanticConfig and overview budget estimation."""

from openviking_cli.utils.config.parser_config import SemanticConfig


def test_semantic_config_defaults():
    """Test default values match previously hardcoded constants.

    These values must stay in sync with the constants that were inlined in
    SemanticProcessor before SemanticConfig was introduced; changing a
    default here is a behavioral change for every deployment that does not
    override it.
    """
    config = SemanticConfig()
    assert config.max_file_content_chars == 30000
    assert config.max_overview_prompt_chars == 60000
    assert config.overview_batch_size == 50
    assert config.abstract_max_chars == 256
    assert config.overview_max_chars == 4000


def test_semantic_config_custom_values():
    """Test custom values override defaults."""
    overrides = {"max_overview_prompt_chars": 100000, "overview_batch_size": 100}
    config = SemanticConfig(**overrides)
    # Every explicitly passed field takes the overridden value.
    for field_name, expected in overrides.items():
        assert getattr(config, field_name) == expected
    # Fields that were not overridden keep their defaults.
    assert config.max_file_content_chars == 30000
    assert config.abstract_max_chars == 256


def test_budget_under_limit_no_batching():
    """Small directories should not trigger batching."""
    config = SemanticConfig()
    # 10 file summaries, each ~100 chars => well under the prompt budget.
    summaries = []
    for i in range(10):
        summaries.append({"name": f"file_{i}.py", "summary": "x" * 100})
    rendered = [
        f"[{idx}] {entry['name']}: {entry['summary']}"
        for idx, entry in enumerate(summaries, 1)
    ]
    total = sum(map(len, rendered))
    assert total < config.max_overview_prompt_chars
    assert len(summaries) <= config.overview_batch_size


def test_budget_over_limit_triggers_batching():
    """Large directories should exceed budget and require batching."""
    config = SemanticConfig()
    # 200 summaries of ~500 chars each: comfortably past the default budget.
    summaries = [{"name": f"file_{i}.py", "summary": "x" * 500} for i in range(200)]
    total = 0
    for idx, entry in enumerate(summaries, 1):
        total += len(f"[{idx}] {entry['name']}: {entry['summary']}")
    assert total > config.max_overview_prompt_chars
    assert len(summaries) > config.overview_batch_size


def test_abstract_truncation():
    """Test abstract is truncated to abstract_max_chars."""
    config = SemanticConfig(abstract_max_chars=100)
    raw = "x" * 200
    limit = config.abstract_max_chars
    # Mirror the processor's rule: cut at limit - 3 and append an ellipsis,
    # so the result is exactly `limit` characters long.
    truncated = raw if len(raw) <= limit else raw[: limit - 3] + "..."
    assert len(truncated) == 100
    assert truncated.endswith("...")


def test_overview_truncation():
    """Test overview is truncated to overview_max_chars."""
    config = SemanticConfig(overview_max_chars=500)
    raw = "x" * 1000
    limit = config.overview_max_chars
    # Overviews are hard-truncated with no ellipsis marker.
    truncated = raw[:limit] if len(raw) > limit else raw
    assert len(truncated) == 500


def test_batch_splitting():
    """Test batch splitting logic produces correct batch count."""
    config = SemanticConfig(overview_batch_size=50)
    summaries = [{"name": f"f{i}.py", "summary": "s"} for i in range(120)]
    size = config.overview_batch_size
    batches = []
    for start in range(0, len(summaries), size):
        batches.append(summaries[start : start + size])
    # 120 files at batch size 50 -> three batches of 50 + 50 + 20.
    assert len(batches) == 3
    assert [len(batch) for batch in batches] == [50, 50, 20]
Loading