Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions openviking/storage/queuefs/semantic_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,7 @@ async def _overview_task(self, dir_uri: str) -> None:
dir_uri, file_summaries, children_abstracts
)
abstract = self._processor._extract_abstract_from_overview(overview)
overview, abstract = self._processor._enforce_size_limits(overview, abstract)

try:
await self._viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=self._ctx)
Expand Down
191 changes: 185 additions & 6 deletions openviking/storage/queuefs/semantic_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@ async def _process_memory_directory(self, msg: SemanticMsg) -> None:

overview = await self._generate_overview(dir_uri, file_summaries, [])
abstract = self._extract_abstract_from_overview(overview)
overview, abstract = self._enforce_size_limits(overview, abstract)

try:
await viking_fs.write_file(f"{dir_uri}/.overview.md", overview, ctx=ctx)
Expand Down Expand Up @@ -577,8 +578,8 @@ async def _generate_text_summary(
logger.warning(f"Failed to decode file as UTF-8, skipping: {file_path}")
return {"name": file_name, "summary": ""}

# Limit content length (about 10000 tokens)
max_chars = 30000
# Limit content length
max_chars = get_openviking_config().semantic.max_file_content_chars
if len(content) > max_chars:
content = content[:max_chars] + "\n...(truncated)"

Expand Down Expand Up @@ -686,6 +687,15 @@ def _extract_abstract_from_overview(self, overview_content: str) -> str:

return "\n".join(content_lines).strip()

def _enforce_size_limits(self, overview: str, abstract: str) -> Tuple[str, str]:
    """Clamp overview and abstract to their configured maximum lengths.

    The overview is hard-truncated at ``overview_max_chars``; the abstract
    is truncated with a trailing "..." marker (total length stays exactly
    ``abstract_max_chars``) so readers can tell it was cut short.

    Args:
        overview: Full overview markdown text.
        abstract: Abstract text extracted from the overview.

    Returns:
        Tuple of (overview, abstract), each within its configured limit.
    """
    limits = get_openviking_config().semantic
    clamped_overview = (
        overview
        if len(overview) <= limits.overview_max_chars
        else overview[: limits.overview_max_chars]
    )
    clamped_abstract = abstract
    if len(clamped_abstract) > limits.abstract_max_chars:
        clamped_abstract = clamped_abstract[: limits.abstract_max_chars - 3] + "..."
    return clamped_overview, clamped_abstract

def _parse_overview_md(self, overview_content: str) -> Dict[str, str]:
"""Parse overview.md and extract file summaries.

Expand Down Expand Up @@ -747,6 +757,11 @@ async def _generate_overview(
) -> str:
"""Generate directory's .overview.md (L1).

For small directories, generates a single overview from all file summaries.
For large directories that would exceed the prompt budget, splits file
summaries into batches, generates a partial overview per batch, then
merges the partials into a final overview.

Args:
dir_uri: Directory URI
file_summaries: File summary list
Expand All @@ -755,9 +770,10 @@ async def _generate_overview(
Returns:
Overview content
"""
import re

vlm = get_openviking_config().vlm
config = get_openviking_config()
vlm = config.vlm
semantic = config.semantic

if not vlm.is_available():
logger.warning("VLM not available, using default overview")
Expand All @@ -778,7 +794,64 @@ async def _generate_overview(
else "None"
)

# Generate overview
# Budget guard: check if prompt would be oversized
estimated_size = len(file_summaries_str) + len(children_abstracts_str)
over_budget = estimated_size > semantic.max_overview_prompt_chars
many_files = len(file_summaries) > semantic.overview_batch_size

if over_budget and many_files:
# Many files, oversized prompt → batch and merge
logger.info(
f"Overview prompt for {dir_uri} exceeds budget "
f"({estimated_size} chars, {len(file_summaries)} files). "
f"Splitting into batches of {semantic.overview_batch_size}."
)
overview = await self._batched_generate_overview(
dir_uri, file_summaries, children_abstracts, file_index_map
)
elif over_budget:
# Few files but long summaries → truncate summaries to fit budget
logger.info(
f"Overview prompt for {dir_uri} exceeds budget "
f"({estimated_size} chars) with {len(file_summaries)} files. "
f"Truncating summaries to fit."
)
budget = semantic.max_overview_prompt_chars
budget -= len(children_abstracts_str)
per_file = max(100, budget // max(len(file_summaries), 1))
truncated_lines = []
for idx, item in enumerate(file_summaries, 1):
summary = item["summary"][:per_file]
truncated_lines.append(f"[{idx}] {item['name']}: {summary}")
file_summaries_str = "\n".join(truncated_lines)
overview = await self._single_generate_overview(
dir_uri,
file_summaries_str,
children_abstracts_str,
file_index_map,
)
else:
overview = await self._single_generate_overview(
dir_uri,
file_summaries_str,
children_abstracts_str,
file_index_map,
)

return overview

async def _single_generate_overview(
self,
dir_uri: str,
file_summaries_str: str,
children_abstracts_str: str,
file_index_map: Dict[int, str],
) -> str:
"""Generate overview from a single prompt (small directories)."""
import re

vlm = get_openviking_config().vlm

try:
prompt = render_prompt(
"semantic.overview_generation",
Expand All @@ -801,9 +874,115 @@ def replace_index(match):
return overview.strip()

except Exception as e:
logger.error(f"Failed to generate overview for {dir_uri}: {e}", exc_info=True)
logger.error(
f"Failed to generate overview for {dir_uri}: {e}",
exc_info=True,
)
return f"# {dir_uri.split('/')[-1]}\n\nDirectory overview"

async def _batched_generate_overview(
    self,
    dir_uri: str,
    file_summaries: List[Dict[str, str]],
    children_abstracts: List[Dict[str, str]],
    file_index_map: Dict[int, str],
) -> str:
    """Generate overview by batching file summaries and merging.

    Splits file summaries into batches, generates a partial overview per
    batch, then merges all partials into a final overview.

    Args:
        dir_uri: URI of the directory being summarized; its last path
            segment is used as the directory name in prompts and fallbacks.
        file_summaries: List of ``{"name", "summary"}`` dicts, one per file.
        children_abstracts: List of ``{"name", "abstract"}`` dicts for child
            directories; included in the first batch and in the merge prompt.
        file_index_map: Not read in this method — each batch builds its own
            index map with global offsets below. NOTE(review): presumably
            kept for signature parity with the single-prompt path; confirm
            before removing.

    Returns:
        Final merged overview markdown. Falls back to a minimal
        ``"# <dir>\\n\\nDirectory overview"`` stub when every batch fails,
        or to the first partial when the merge step fails.
    """
    import re

    vlm = get_openviking_config().vlm
    semantic = get_openviking_config().semantic
    batch_size = semantic.overview_batch_size
    dir_name = dir_uri.split("/")[-1]

    # Split file summaries into fixed-size batches
    batches = [
        file_summaries[i : i + batch_size] for i in range(0, len(file_summaries), batch_size)
    ]
    logger.info(f"Generating overview for {dir_uri} in {len(batches)} batches")

    # Build children abstracts string (used in first batch + merge)
    children_abstracts_str = (
        "\n".join(f"- {item['name']}/: {item['abstract']}" for item in children_abstracts)
        if children_abstracts
        else "None"
    )

    # Generate partial overview per batch using global file indices so that
    # [number] references remain unique across the whole directory, not
    # merely within one batch.
    partial_overviews = []
    global_offset = 0
    for batch_idx, batch in enumerate(batches):
        # Build per-batch index map using global offsets (1-based indices)
        batch_lines = []
        batch_index_map = {}
        for local_idx, item in enumerate(batch):
            global_idx = global_offset + local_idx + 1
            batch_index_map[global_idx] = item["name"]
            batch_lines.append(f"[{global_idx}] {item['name']}: {item['summary']}")
        batch_str = "\n".join(batch_lines)
        global_offset += len(batch)

        # Include children abstracts in the first batch only, so they are
        # not repeated (and re-billed) in every prompt.
        children_str = children_abstracts_str if batch_idx == 0 else "None"

        try:
            prompt = render_prompt(
                "semantic.overview_generation",
                {
                    "dir_name": dir_name,
                    "file_summaries": batch_str,
                    "children_abstracts": children_str,
                },
            )
            partial = await vlm.get_completion_async(prompt)

            # Replace [number] references per batch using batch-local map.
            # The factory function binds idx_map eagerly, avoiding the
            # late-binding-closure-in-loop pitfall.
            def make_replacer(idx_map):
                def replacer(match):
                    idx = int(match.group(1))
                    # Unknown indices are left as-is rather than dropped
                    return idx_map.get(idx, match.group(0))

                return replacer

            partial = re.sub(r"\[(\d+)\]", make_replacer(batch_index_map), partial)
            partial_overviews.append(partial.strip())
        except Exception as e:
            # A failed batch is skipped, not fatal — remaining batches can
            # still produce a usable (if incomplete) overview.
            logger.warning(
                f"Failed to generate partial overview batch "
                f"{batch_idx + 1}/{len(batches)} for {dir_uri}: {e}"
            )

    if not partial_overviews:
        # Every batch failed: return a minimal stub overview
        return f"# {dir_name}\n\nDirectory overview"

    # If only one batch succeeded, use it directly (no merge call needed)
    if len(partial_overviews) == 1:
        return partial_overviews[0]

    # Merge partials into a final overview (include children for context)
    combined = "\n\n---\n\n".join(partial_overviews)
    try:
        prompt = render_prompt(
            "semantic.overview_generation",
            {
                "dir_name": dir_name,
                "file_summaries": combined,
                "children_abstracts": children_abstracts_str,
            },
        )
        overview = await vlm.get_completion_async(prompt)
        return overview.strip()
    except Exception as e:
        logger.error(
            f"Failed to merge partial overviews for {dir_uri}: {e}",
            exc_info=True,
        )
        # Merge failed: degrade gracefully to the first partial overview
        return partial_overviews[0]

async def _vectorize_directory(
self,
uri: str,
Expand Down
6 changes: 6 additions & 0 deletions openviking_cli/utils/config/open_viking_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
ImageConfig,
MarkdownConfig,
PDFConfig,
SemanticConfig,
TextConfig,
VideoConfig,
)
Expand Down Expand Up @@ -94,6 +95,11 @@ class OpenVikingConfig(BaseModel):
default_factory=lambda: DirectoryConfig(), description="Directory parsing configuration"
)

semantic: SemanticConfig = Field(
default_factory=lambda: SemanticConfig(),
description="Semantic processing configuration (overview/abstract limits)",
)

auto_generate_l0: bool = Field(
default=True, description="Automatically generate L0 (abstract) if not provided"
)
Expand Down
26 changes: 26 additions & 0 deletions openviking_cli/utils/config/parser_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,32 @@ class DirectoryConfig(ParserConfig):
preserve_structure: bool = True


@dataclass
class SemanticConfig:
    """
    Configuration for semantic processing (overview/abstract generation).

    Controls prompt budget limits and output size constraints for the
    SemanticProcessor pipeline. Defaults mirror the constants that were
    previously hardcoded in the processor.
    """

    max_file_content_chars: int = 30000
    """Maximum characters of file content sent to LLM for summary generation."""

    max_overview_prompt_chars: int = 60000
    """Maximum characters allowed in the overview generation prompt.
    If exceeded, file summaries are batched and merged."""

    overview_batch_size: int = 50
    """Maximum number of file summaries per batch when splitting oversized prompts."""

    # NOTE(review): abstract truncation appends "..." after cutting at
    # abstract_max_chars - 3, so values <= 3 would misbehave — confirm a
    # sane lower bound is enforced by callers or config validation.
    abstract_max_chars: int = 256
    """Maximum characters for generated abstracts."""

    overview_max_chars: int = 4000
    """Maximum characters for generated overviews."""


# Configuration registry for dynamic loading
PARSER_CONFIG_REGISTRY = {
"pdf": PDFConfig,
Expand Down
82 changes: 82 additions & 0 deletions tests/misc/test_semantic_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
# SPDX-License-Identifier: Apache-2.0

"""Tests for SemanticConfig and overview budget estimation."""

from openviking_cli.utils.config.parser_config import SemanticConfig


def test_semantic_config_defaults():
    """Test default values match previously hardcoded constants.

    These values must stay in sync with the constants that were inlined in
    SemanticProcessor before SemanticConfig was introduced; changing a
    default here is a behavioral change for every deployment that does not
    override it.
    """
    config = SemanticConfig()
    assert config.max_file_content_chars == 30000
    assert config.max_overview_prompt_chars == 60000
    assert config.overview_batch_size == 50
    assert config.abstract_max_chars == 256
    assert config.overview_max_chars == 4000


def test_semantic_config_custom_values():
    """Test custom values override defaults."""
    overrides = {"max_overview_prompt_chars": 100000, "overview_batch_size": 100}
    config = SemanticConfig(**overrides)
    # Every explicitly passed field takes the overridden value.
    for field_name, expected in overrides.items():
        assert getattr(config, field_name) == expected
    # Fields that were not overridden keep their defaults.
    assert config.max_file_content_chars == 30000
    assert config.abstract_max_chars == 256


def test_budget_under_limit_no_batching():
    """Small directories should not trigger batching."""
    config = SemanticConfig()
    # 10 file summaries, each ~100 chars => well under the prompt budget.
    summaries = []
    for i in range(10):
        summaries.append({"name": f"file_{i}.py", "summary": "x" * 100})
    rendered = [
        f"[{idx}] {entry['name']}: {entry['summary']}"
        for idx, entry in enumerate(summaries, 1)
    ]
    total = sum(map(len, rendered))
    assert total < config.max_overview_prompt_chars
    assert len(summaries) <= config.overview_batch_size


def test_budget_over_limit_triggers_batching():
    """Large directories should exceed budget and require batching."""
    config = SemanticConfig()
    # 200 summaries of ~500 chars each: comfortably past the default budget.
    summaries = [{"name": f"file_{i}.py", "summary": "x" * 500} for i in range(200)]
    total = 0
    for idx, entry in enumerate(summaries, 1):
        total += len(f"[{idx}] {entry['name']}: {entry['summary']}")
    assert total > config.max_overview_prompt_chars
    assert len(summaries) > config.overview_batch_size


def test_abstract_truncation():
    """Test abstract is truncated to abstract_max_chars."""
    config = SemanticConfig(abstract_max_chars=100)
    raw = "x" * 200
    limit = config.abstract_max_chars
    # Mirror the processor's rule: cut at limit - 3 and append an ellipsis,
    # so the result is exactly `limit` characters long.
    truncated = raw if len(raw) <= limit else raw[: limit - 3] + "..."
    assert len(truncated) == 100
    assert truncated.endswith("...")


def test_overview_truncation():
    """Test overview is truncated to overview_max_chars."""
    config = SemanticConfig(overview_max_chars=500)
    raw = "x" * 1000
    limit = config.overview_max_chars
    # Overviews are hard-truncated with no ellipsis marker.
    truncated = raw[:limit] if len(raw) > limit else raw
    assert len(truncated) == 500


def test_batch_splitting():
    """Test batch splitting logic produces correct batch count."""
    config = SemanticConfig(overview_batch_size=50)
    summaries = [{"name": f"f{i}.py", "summary": "s"} for i in range(120)]
    size = config.overview_batch_size
    batches = []
    for start in range(0, len(summaries), size):
        batches.append(summaries[start : start + size])
    # 120 files at batch size 50 -> three batches of 50 + 50 + 20.
    assert len(batches) == 3
    assert [len(batch) for batch in batches] == [50, 50, 20]
Loading