Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions openviking/storage/queuefs/semantic_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,14 @@ async def _overview_task(self, dir_uri: str) -> None:
overview = await self._processor._generate_overview(
dir_uri, file_summaries, children_abstracts
)
from openviking_cli.utils.config import get_openviking_config

semantic = get_openviking_config().semantic
if len(overview) > semantic.overview_max_chars:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Design] (non-blocking) This truncation block is duplicated nearly identically in semantic_processor.py:_process_memory_directory (lines ~357-362). Consider extracting a shared helper like _enforce_size_limits(overview, abstract) to avoid divergence when limits or truncation logic change.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. Extracted _enforce_size_limits(overview, abstract) on SemanticProcessor. Both call sites (semantic_processor.py and semantic_dag.py) now use it.

overview = overview[: semantic.overview_max_chars]
abstract = self._processor._extract_abstract_from_overview(overview)
if len(abstract) > semantic.abstract_max_chars:
abstract = abstract[: semantic.abstract_max_chars - 3] + "..."

try:
await self._viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=self._ctx)
Expand Down
154 changes: 148 additions & 6 deletions openviking/storage/queuefs/semantic_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,12 @@ async def _process_memory_directory(self, msg: SemanticMsg) -> None:
file_summaries.append({"name": file_name, "summary": ""})

overview = await self._generate_overview(dir_uri, file_summaries, [])
semantic = get_openviking_config().semantic
if len(overview) > semantic.overview_max_chars:
overview = overview[: semantic.overview_max_chars]
abstract = self._extract_abstract_from_overview(overview)
if len(abstract) > semantic.abstract_max_chars:
abstract = abstract[: semantic.abstract_max_chars - 3] + "..."

try:
await viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=ctx)
Expand Down Expand Up @@ -577,8 +582,8 @@ async def _generate_text_summary(
logger.warning(f"Failed to decode file as UTF-8, skipping: {file_path}")
return {"name": file_name, "summary": ""}

# Limit content length (about 10000 tokens)
max_chars = 30000
# Limit content length
max_chars = get_openviking_config().semantic.max_file_content_chars
if len(content) > max_chars:
content = content[:max_chars] + "\n...(truncated)"

Expand Down Expand Up @@ -747,6 +752,11 @@ async def _generate_overview(
) -> str:
"""Generate directory's .overview.md (L1).

For small directories, generates a single overview from all file summaries.
For large directories that would exceed the prompt budget, splits file
summaries into batches, generates a partial overview per batch, then
merges the partials into a final overview.

Args:
dir_uri: Directory URI
file_summaries: File summary list
Expand All @@ -755,9 +765,10 @@ async def _generate_overview(
Returns:
Overview content
"""
import re

vlm = get_openviking_config().vlm
config = get_openviking_config()
vlm = config.vlm
semantic = config.semantic

if not vlm.is_available():
logger.warning("VLM not available, using default overview")
Expand All @@ -778,7 +789,42 @@ async def _generate_overview(
else "None"
)

# Generate overview
# Budget guard: check if prompt would be oversized
estimated_size = len(file_summaries_str) + len(children_abstracts_str)
if (
estimated_size > semantic.max_overview_prompt_chars
and len(file_summaries) > semantic.overview_batch_size
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Bug] (blocking) The AND condition requires both estimated_size > budget and file_count > batch_size for the guard to trigger. This means a directory with ≤50 files but very long individual summaries (e.g. 50 files × 1.5K summary = 75K chars, exceeding the 60K budget) will bypass the guard entirely and still produce an oversized prompt.

This is the exact failure mode described in #529 — oversized prompts causing VLM timeouts. The fix only protects against the many-files variant, not the few-files-long-summaries variant.

Suggested fix: consider using OR instead of AND, or add a fallback for the case where the prompt is oversized but file count is below batch_size (e.g., truncating individual summaries proportionally to fit the budget).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. Changed to three-way branching: (1) over budget + many files → batch and merge, (2) over budget + few files → truncate individual summaries proportionally to fit budget, (3) under budget → proceed as normal.

):
logger.info(
f"Overview prompt for {dir_uri} exceeds budget "
f"({estimated_size} chars, {len(file_summaries)} files). "
f"Splitting into batches of {semantic.overview_batch_size}."
)
overview = await self._batched_generate_overview(
dir_uri, file_summaries, children_abstracts, file_index_map
)
else:
overview = await self._single_generate_overview(
dir_uri,
file_summaries_str,
children_abstracts_str,
file_index_map,
)

return overview

async def _single_generate_overview(
self,
dir_uri: str,
file_summaries_str: str,
children_abstracts_str: str,
file_index_map: Dict[int, str],
) -> str:
"""Generate overview from a single prompt (small directories)."""
import re

vlm = get_openviking_config().vlm

try:
prompt = render_prompt(
"semantic.overview_generation",
Expand All @@ -801,9 +847,105 @@ def replace_index(match):
return overview.strip()

except Exception as e:
logger.error(f"Failed to generate overview for {dir_uri}: {e}", exc_info=True)
logger.error(
f"Failed to generate overview for {dir_uri}: {e}",
exc_info=True,
)
return f"# {dir_uri.split('/')[-1]}\n\nDirectory overview"

async def _batched_generate_overview(
    self,
    dir_uri: str,
    file_summaries: List[Dict[str, str]],
    children_abstracts: List[Dict[str, str]],
    file_index_map: Dict[int, str],
) -> str:
    """Generate overview by batching file summaries and merging.

    Splits file summaries into batches, generates a partial overview per
    batch, then merges all partials into a final overview.

    Args:
        dir_uri: Directory URI being summarized.
        file_summaries: Per-file {"name", "summary"} dicts for the directory.
        children_abstracts: Per-subdirectory {"name", "abstract"} dicts.
        file_index_map: Global 1-based index -> file name map used to resolve
            [N] references the LLM emits in the overview text.

    Returns:
        Final overview text; a minimal "# <dir>" fallback if every batch fails.
    """
    import re

    config = get_openviking_config()
    vlm = config.vlm
    semantic = config.semantic
    batch_size = semantic.overview_batch_size
    dir_name = dir_uri.split("/")[-1]

    # Split file summaries into fixed-size batches.
    batches = [
        file_summaries[i : i + batch_size]
        for i in range(0, len(file_summaries), batch_size)
    ]
    logger.info(f"Generating overview for {dir_uri} in {len(batches)} batches")

    # Render children abstracts once; reused for batch 1 and the merge step.
    children_abstracts_str = (
        "\n".join(f"- {item['name']}/: {item['abstract']}" for item in children_abstracts)
        if children_abstracts
        else "None"
    )

    # Generate a partial overview per batch.
    partial_overviews = []
    for batch_idx, batch in enumerate(batches):
        # Use GLOBAL indices (offset by batch position) so that [N]
        # references in any partial overview resolve correctly against
        # file_index_map. Local per-batch numbering would make batch 2's
        # [1] resolve to global file #1.
        offset = batch_idx * batch_size
        batch_lines = []
        for local_idx, item in enumerate(batch, 1):
            batch_lines.append(f"[{offset + local_idx}] {item['name']}: {item['summary']}")
        batch_str = "\n".join(batch_lines)

        # Only include children abstracts in the first batch; they are
        # re-supplied at the merge step below.
        children_str = children_abstracts_str if batch_idx == 0 else "None"

        try:
            prompt = render_prompt(
                "semantic.overview_generation",
                {
                    "dir_name": dir_name,
                    "file_summaries": batch_str,
                    "children_abstracts": children_str,
                },
            )
            partial = await vlm.get_completion_async(prompt)
            partial_overviews.append(partial.strip())
        except Exception as e:
            logger.warning(
                f"Failed to generate partial overview batch "
                f"{batch_idx + 1}/{len(batches)} for {dir_uri}: {e}"
            )

    if not partial_overviews:
        # Every batch failed; fall back to a bare heading.
        return f"# {dir_name}\n\nDirectory overview"

    # If only one batch succeeded, use it directly
    if len(partial_overviews) == 1:
        overview = partial_overviews[0]
    else:
        # Merge partials into a final overview. Pass the children abstracts
        # again so subdirectory context survives the merge even if the LLM
        # dropped it from batch 1's partial output.
        combined = "\n\n---\n\n".join(partial_overviews)
        try:
            prompt = render_prompt(
                "semantic.overview_generation",
                {
                    "dir_name": dir_name,
                    "file_summaries": combined,
                    "children_abstracts": children_abstracts_str,
                },
            )
            overview = await vlm.get_completion_async(prompt)
            overview = overview.strip()
        except Exception as e:
            logger.error(
                f"Failed to merge partial overviews for {dir_uri}: {e}",
                exc_info=True,
            )
            overview = partial_overviews[0]

    # Post-process: replace [number] with actual file name. Batch prompts
    # used global indices, so the global file_index_map applies directly.
    def replace_index(match):
        idx = int(match.group(1))
        return file_index_map.get(idx, match.group(0))

    overview = re.sub(r"\[(\d+)\]", replace_index, overview)

    return overview

async def _vectorize_directory(
self,
uri: str,
Expand Down
6 changes: 6 additions & 0 deletions openviking_cli/utils/config/open_viking_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
ImageConfig,
MarkdownConfig,
PDFConfig,
SemanticConfig,
TextConfig,
VideoConfig,
)
Expand Down Expand Up @@ -94,6 +95,11 @@ class OpenVikingConfig(BaseModel):
default_factory=lambda: DirectoryConfig(), description="Directory parsing configuration"
)

semantic: SemanticConfig = Field(
default_factory=lambda: SemanticConfig(),
description="Semantic processing configuration (overview/abstract limits)",
)

auto_generate_l0: bool = Field(
default=True, description="Automatically generate L0 (abstract) if not provided"
)
Expand Down
26 changes: 26 additions & 0 deletions openviking_cli/utils/config/parser_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,32 @@ class DirectoryConfig(ParserConfig):
preserve_structure: bool = True


@dataclass
class SemanticConfig:
    """Settings for semantic processing (overview/abstract generation).

    Bundles the prompt-budget limits and output-size constraints consumed
    by the SemanticProcessor pipeline.
    """

    # Maximum characters of file content sent to the LLM for summary generation.
    max_file_content_chars: int = 30000

    # Maximum characters allowed in the overview-generation prompt; when
    # exceeded, file summaries are batched and the partials merged.
    max_overview_prompt_chars: int = 60000

    # Maximum number of file summaries per batch when splitting oversized prompts.
    overview_batch_size: int = 50

    # Maximum characters for generated abstracts.
    abstract_max_chars: int = 256

    # Maximum characters for generated overviews.
    overview_max_chars: int = 4000


# Configuration registry for dynamic loading
PARSER_CONFIG_REGISTRY = {
"pdf": PDFConfig,
Expand Down
82 changes: 82 additions & 0 deletions tests/misc/test_semantic_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
# SPDX-License-Identifier: Apache-2.0

"""Tests for SemanticConfig and overview budget estimation."""

from openviking_cli.utils.config.parser_config import SemanticConfig


def test_semantic_config_defaults():
    """Defaults must equal the constants that were previously hardcoded."""
    cfg = SemanticConfig()
    expected = {
        "max_file_content_chars": 30000,
        "max_overview_prompt_chars": 60000,
        "overview_batch_size": 50,
        "abstract_max_chars": 256,
        "overview_max_chars": 4000,
    }
    for attr, value in expected.items():
        assert getattr(cfg, attr) == value


def test_semantic_config_custom_values():
    """Explicit constructor arguments override defaults; others are kept."""
    cfg = SemanticConfig(
        max_overview_prompt_chars=100000,
        overview_batch_size=100,
    )
    assert cfg.max_overview_prompt_chars == 100000
    assert cfg.overview_batch_size == 100
    # Fields that were not overridden retain their defaults.
    assert cfg.max_file_content_chars == 30000
    assert cfg.abstract_max_chars == 256


def test_budget_under_limit_no_batching():
    """A small directory stays under both the prompt budget and batch size."""
    config = SemanticConfig()
    # 10 file summaries, each ~100 chars = ~1000 chars total
    summaries = [{"name": f"file_{i}.py", "summary": "x" * 100} for i in range(10)]
    lines = [f"[{i}] {s['name']}: {s['summary']}" for i, s in enumerate(summaries, 1)]
    assert sum(map(len, lines)) < config.max_overview_prompt_chars
    assert len(summaries) <= config.overview_batch_size


def test_budget_over_limit_triggers_batching():
    """A large directory must exceed the budget and the batch-size threshold."""
    config = SemanticConfig()
    # 200 file summaries, each ~500 chars = ~100000+ chars total
    summaries = [{"name": f"file_{i}.py", "summary": "x" * 500} for i in range(200)]
    lines = [f"[{i}] {s['name']}: {s['summary']}" for i, s in enumerate(summaries, 1)]
    assert sum(map(len, lines)) > config.max_overview_prompt_chars
    assert len(summaries) > config.overview_batch_size


def test_abstract_truncation():
    """An over-long abstract is cut to abstract_max_chars, ending in '...'."""
    config = SemanticConfig(abstract_max_chars=100)
    text = "x" * 200
    if len(text) > config.abstract_max_chars:
        text = text[: config.abstract_max_chars - 3] + "..."
    assert len(text) == 100
    assert text.endswith("...")


def test_overview_truncation():
    """An over-long overview is hard-truncated to overview_max_chars."""
    config = SemanticConfig(overview_max_chars=500)
    text = "x" * 1000
    limit = config.overview_max_chars
    text = text[:limit] if len(text) > limit else text
    assert len(text) == 500


def test_batch_splitting():
    """120 summaries with batch size 50 split into batches of 50 / 50 / 20."""
    config = SemanticConfig(overview_batch_size=50)
    summaries = [{"name": f"f{i}.py", "summary": "s"} for i in range(120)]
    step = config.overview_batch_size
    batches = [summaries[start : start + step] for start in range(0, len(summaries), step)]
    assert [len(b) for b in batches] == [50, 50, 20]
Loading