def _enforce_size_limits(self, overview: str, abstract: str) -> Tuple[str, str]:
    """Clamp the overview and abstract to their configured maximum lengths.

    The limits are read from ``get_openviking_config().semantic``
    (``overview_max_chars`` and ``abstract_max_chars``).

    Args:
        overview: Generated overview text.
        abstract: Abstract text extracted from the overview.

    Returns:
        ``(overview, abstract)``. An over-long overview is hard-cut with no
        marker. An over-long abstract is cut so that, with a trailing
        ``"..."``, its length equals the limit; if the limit is too small
        to hold the marker, the abstract is hard-cut instead.
    """
    semantic = get_openviking_config().semantic
    marker = "..."

    # A negative limit used as a slice end would count from the end of the
    # string rather than truncate, so treat it as "keep nothing".
    overview_limit = max(0, semantic.overview_max_chars)
    abstract_limit = max(0, semantic.abstract_max_chars)

    if len(overview) > overview_limit:
        overview = overview[:overview_limit]

    if len(abstract) > abstract_limit:
        if abstract_limit >= len(marker):
            abstract = abstract[: abstract_limit - len(marker)] + marker
        else:
            # No room for the marker: `limit - 3` would be negative and the
            # result would exceed the limit, so cut without a marker.
            abstract = abstract[:abstract_limit]

    return overview, abstract
async def _batched_generate_overview(
    self,
    dir_uri: str,
    file_summaries: List[Dict[str, str]],
    children_abstracts: List[Dict[str, str]],
    file_index_map: Dict[int, str],
) -> str:
    """Generate an overview by batching file summaries and merging the results.

    File summaries are split into batches of ``semantic.overview_batch_size``.
    Each batch is rendered with the ``semantic.overview_generation`` prompt
    and sent to the VLM; ``[number]`` references in each partial result are
    replaced with the corresponding file name. The partial overviews are then
    joined and passed through the same prompt once more to produce the final
    overview.

    Args:
        dir_uri: Directory URI; its last ``/``-separated segment is used as
            the directory name in prompts and in the placeholder overview.
        file_summaries: Items read via the ``"name"`` and ``"summary"`` keys.
        children_abstracts: Items read via the ``"name"`` and ``"abstract"``
            keys. Sent with the first batch and with the merge prompt only.
        file_index_map: Not read by this method; indices are rebuilt per
            batch. Kept so the signature matches the caller.

    Returns:
        The merged overview. If every batch fails, a placeholder
        ``"# <dir_name>\\n\\nDirectory overview"``. If exactly one batch
        succeeds, that partial overview. If the merge call fails, the first
        partial overview.
    """
    import re

    config = get_openviking_config()
    vlm = config.vlm
    # A step of 0 makes range() raise ValueError and a negative step yields
    # no batches at all, so never go below one summary per batch.
    batch_size = max(1, config.semantic.overview_batch_size)
    dir_name = dir_uri.split("/")[-1]
    index_ref = re.compile(r"\[(\d+)\]")

    # Split file summaries into batches
    batches = [
        file_summaries[start : start + batch_size]
        for start in range(0, len(file_summaries), batch_size)
    ]
    logger.info(f"Generating overview for {dir_uri} in {len(batches)} batches")

    # Build children abstracts string (used in first batch + merge)
    children_abstracts_str = (
        "\n".join(f"- {item['name']}/: {item['abstract']}" for item in children_abstracts)
        if children_abstracts
        else "None"
    )

    # Generate partial overview per batch using global (1-based) file indices
    partial_overviews: List[str] = []
    for batch_idx, batch in enumerate(batches):
        # Every batch before this one is full, so the offset is a product.
        first_idx = batch_idx * batch_size + 1
        batch_index_map = {
            idx: item["name"] for idx, item in enumerate(batch, start=first_idx)
        }
        batch_str = "\n".join(
            f"[{idx}] {item['name']}: {item['summary']}"
            for idx, item in enumerate(batch, start=first_idx)
        )

        # Include children abstracts in the first batch only
        children_str = children_abstracts_str if batch_idx == 0 else "None"

        try:
            prompt = render_prompt(
                "semantic.overview_generation",
                {
                    "dir_name": dir_name,
                    "file_summaries": batch_str,
                    "children_abstracts": children_str,
                },
            )
            partial = await vlm.get_completion_async(prompt)

            # Replace [number] references with file names; numbers that are
            # not in this batch's map are left untouched. The lambda runs
            # immediately, so it always sees this iteration's map.
            partial = index_ref.sub(
                lambda match: batch_index_map.get(int(match.group(1)), match.group(0)),
                partial,
            )
            partial_overviews.append(partial.strip())
        except Exception as e:
            # Best effort: a failed batch is skipped, the others still count.
            logger.warning(
                f"Failed to generate partial overview batch "
                f"{batch_idx + 1}/{len(batches)} for {dir_uri}: {e}"
            )

    if not partial_overviews:
        return f"# {dir_name}\n\nDirectory overview"

    # If only one batch succeeded, use it directly
    if len(partial_overviews) == 1:
        return partial_overviews[0]

    # Merge partials into a final overview (include children for context)
    combined = "\n\n---\n\n".join(partial_overviews)
    try:
        prompt = render_prompt(
            "semantic.overview_generation",
            {
                "dir_name": dir_name,
                "file_summaries": combined,
                "children_abstracts": children_abstracts_str,
            },
        )
        overview = await vlm.get_completion_async(prompt)
        return overview.strip()
    except Exception as e:
        logger.error(
            f"Failed to merge partial overviews for {dir_uri}: {e}",
            exc_info=True,
        )
        return partial_overviews[0]
@dataclass
class SemanticConfig:
    """
    Configuration for semantic processing (overview/abstract generation).

    Controls prompt budget limits and output size constraints for the
    SemanticProcessor pipeline. Every field is a size or count and must be
    at least 1; construction raises ``ValueError`` otherwise.
    """

    max_file_content_chars: int = 30000
    """Maximum characters of file content sent to LLM for summary generation."""

    max_overview_prompt_chars: int = 60000
    """Budget for the overview prompt, measured as the combined length of the
    rendered file summaries and child-directory abstracts. When exceeded,
    summaries are batched and merged if there are more files than
    ``overview_batch_size``; otherwise each summary is truncated to fit."""

    overview_batch_size: int = 50
    """Maximum number of file summaries per batch when splitting oversized prompts."""

    abstract_max_chars: int = 256
    """Maximum characters for generated abstracts."""

    overview_max_chars: int = 4000
    """Maximum characters for generated overviews."""

    def __post_init__(self) -> None:
        """Reject non-positive limits at construction time.

        These values are used downstream as slice bounds and as a ``range``
        step, where zero or negative numbers either raise or silently
        produce wrong output, so fail early with a clear message.

        Raises:
            ValueError: If any field is smaller than 1.
        """
        for field_name in (
            "max_file_content_chars",
            "max_overview_prompt_chars",
            "overview_batch_size",
            "abstract_max_chars",
            "overview_max_chars",
        ):
            value = getattr(self, field_name)
            if value < 1:
                raise ValueError(f"{field_name} must be >= 1, got {value!r}")
# SPDX-License-Identifier: Apache-2.0

"""Unit tests for SemanticConfig values and the size arithmetic built on them."""

from openviking_cli.utils.config.parser_config import SemanticConfig


def test_semantic_config_defaults():
    """A default-constructed config carries the expected default limits."""
    cfg = SemanticConfig()
    expected_defaults = {
        "max_file_content_chars": 30000,
        "max_overview_prompt_chars": 60000,
        "overview_batch_size": 50,
        "abstract_max_chars": 256,
        "overview_max_chars": 4000,
    }
    for field_name, expected in expected_defaults.items():
        assert getattr(cfg, field_name) == expected


def test_semantic_config_custom_values():
    """Keyword overrides take effect and leave the other fields alone."""
    overrides = {"max_overview_prompt_chars": 100000, "overview_batch_size": 100}
    cfg = SemanticConfig(**overrides)
    assert cfg.max_overview_prompt_chars == 100000
    assert cfg.overview_batch_size == 100
    # Fields that were not overridden keep their defaults
    assert cfg.max_file_content_chars == 30000
    assert cfg.abstract_max_chars == 256


def test_budget_under_limit_no_batching():
    """Ten short summaries stay under both the char budget and the batch size."""
    cfg = SemanticConfig()
    entries = [{"name": f"file_{n}.py", "summary": "x" * 100} for n in range(10)]
    rendered = [
        f"[{pos}] {entry['name']}: {entry['summary']}"
        for pos, entry in enumerate(entries, 1)
    ]
    prompt_chars = sum(map(len, rendered))
    assert prompt_chars < cfg.max_overview_prompt_chars
    assert len(entries) <= cfg.overview_batch_size


def test_budget_over_limit_triggers_batching():
    """Two hundred long summaries exceed both the char budget and the batch size."""
    cfg = SemanticConfig()
    entries = [{"name": f"file_{n}.py", "summary": "x" * 500} for n in range(200)]
    rendered = [
        f"[{pos}] {entry['name']}: {entry['summary']}"
        for pos, entry in enumerate(entries, 1)
    ]
    prompt_chars = sum(map(len, rendered))
    assert prompt_chars > cfg.max_overview_prompt_chars
    assert len(entries) > cfg.overview_batch_size


def test_abstract_truncation():
    """Cutting to ``abstract_max_chars - 3`` plus an ellipsis lands on the limit."""
    cfg = SemanticConfig(abstract_max_chars=100)
    limit = cfg.abstract_max_chars
    text = "x" * 200
    if len(text) > limit:
        text = text[: limit - 3] + "..."
    assert len(text) == 100
    assert text.endswith("...")


def test_overview_truncation():
    """Slicing to ``overview_max_chars`` yields exactly that many characters."""
    cfg = SemanticConfig(overview_max_chars=500)
    limit = cfg.overview_max_chars
    text = "x" * 1000
    if len(text) > limit:
        text = text[:limit]
    assert len(text) == 500


def test_batch_splitting():
    """120 summaries at batch size 50 split into chunks of 50, 50 and 20."""
    cfg = SemanticConfig(overview_batch_size=50)
    size = cfg.overview_batch_size
    entries = [{"name": f"f{n}.py", "summary": "s"} for n in range(120)]
    chunks = [entries[start : start + size] for start in range(0, len(entries), size)]
    assert len(chunks) == 3  # 50 + 50 + 20
    assert [len(chunk) for chunk in chunks] == [50, 50, 20]