Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions pr_split/planner/partitioning.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,147 @@ def _group_units_graph(
return grouped_units


def _group_load(group_units: list[PartitionUnit]) -> int:
return sum(unit.loc for unit in group_units)


def _group_anchor_position(group_units: list[PartitionUnit]) -> int:
return min(unit.position for unit in group_units)


def _group_affinity(
    left_group: list[PartitionUnit], right_group: list[PartitionUnit], priority: Priority
) -> int:
    """Return the total pairwise affinity between every unit of the two groups."""
    total = 0
    for left_unit in left_group:
        for right_unit in right_group:
            total += _affinity_score(left_unit, right_unit, priority)
    return total


def _shared_file_merge_is_contiguous(
grouped_units: list[list[PartitionUnit]], source_idx: int, target_idx: int
) -> bool:
source_files = {unit.file_path for unit in grouped_units[source_idx]}
target_files = {unit.file_path for unit in grouped_units[target_idx]}
shared_files = source_files & target_files

if not shared_files:
return True

for file_path in shared_files:
ordered_occurrences: list[tuple[int, int]] = []
for group_idx, group_units in enumerate(grouped_units):
positions = [unit.position for unit in group_units if unit.file_path == file_path]
if positions:
ordered_occurrences.append((min(positions), group_idx))

occurrence_order = [group_idx for _, group_idx in sorted(ordered_occurrences)]
if abs(occurrence_order.index(source_idx) - occurrence_order.index(target_idx)) != 1:
return False

return True


def _best_graph_merge_target(
    grouped_units: list[list[PartitionUnit]], source_idx: int, *, settings: Settings
) -> int | None:
    """Pick the best group to absorb an undersized source group, or None.

    A candidate target qualifies only if the merged group stays within
    ``settings.max_loc``, keeps shared files contiguous, and strictly reduces
    the combined min_loc underflow. Candidates are ranked lexicographically:
    reaching min_loc first, then higher affinity, then closer anchors, then a
    merged load closer to max_loc, then lower group indices (determinism).
    """
    if settings.min_loc is None:
        return None

    source_group = grouped_units[source_idx]
    source_load = _group_load(source_group)
    source_underflow = max(0, settings.min_loc - source_load)
    if source_underflow == 0:
        # Source already satisfies min_loc; nothing to repair.
        return None

    best_idx: int | None = None
    best_key: tuple[int, int, int, int, int, int] | None = None
    source_anchor = _group_anchor_position(source_group)

    for target_idx, target_group in enumerate(grouped_units):
        if target_idx == source_idx:
            continue

        # Hoisted: _group_load(target_group) was previously recomputed for the
        # underflow check below.
        target_load = _group_load(target_group)
        merged_load = source_load + target_load
        if merged_load > settings.max_loc:
            continue
        if not _shared_file_merge_is_contiguous(grouped_units, source_idx, target_idx):
            continue

        # Only merge when the total underflow strictly improves.
        current_underflow = source_underflow + max(0, settings.min_loc - target_load)
        merged_underflow = max(0, settings.min_loc - merged_load)
        if merged_underflow >= current_underflow:
            continue

        target_anchor = _group_anchor_position(target_group)
        merge_key = (
            int(merged_load >= settings.min_loc),
            _group_affinity(source_group, target_group, settings.priority),
            -abs(source_anchor - target_anchor),
            -abs(settings.max_loc - merged_load),
            -min(source_idx, target_idx),
            -max(source_idx, target_idx),
        )
        if best_key is None or merge_key > best_key:
            best_idx = target_idx
            best_key = merge_key

    return best_idx


def _merge_group_units(
grouped_units: list[list[PartitionUnit]], source_idx: int, target_idx: int
) -> list[list[PartitionUnit]]:
merged_group = sorted(
grouped_units[target_idx] + grouped_units[source_idx],
key=lambda unit: unit.position,
)
repaired_groups = [list(group_units) for group_units in grouped_units]
repaired_groups[target_idx] = merged_group
del repaired_groups[source_idx]
return repaired_groups
Comment on lines +271 to +278

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The current implementation of _merge_group_units is inefficient as it creates a deep copy of all groups ([list(group_units) for group_units in grouped_units]) on every merge. This can be costly if there are many groups. Additionally, using del with indices can be subtle to reason about.

A clearer and more performant approach is to build a new list from scratch, which avoids the expensive copy and the del operation.

Suggested change
merged_group = sorted(
grouped_units[target_idx] + grouped_units[source_idx],
key=lambda unit: unit.position,
)
repaired_groups = [list(group_units) for group_units in grouped_units]
repaired_groups[target_idx] = merged_group
del repaired_groups[source_idx]
return repaired_groups
merged_group = sorted(
grouped_units[target_idx] + grouped_units[source_idx],
key=lambda unit: unit.position,
)
repaired_groups = []
for i, group in enumerate(grouped_units):
if i == source_idx:
continue
if i == target_idx:
repaired_groups.append(merged_group)
else:
repaired_groups.append(group)
return repaired_groups



def _repair_graph_min_loc(
    grouped_units: list[list[PartitionUnit]], *, settings: Settings
) -> list[list[PartitionUnit]]:
    """Greedily merge undersized groups until no further min_loc repair helps.

    Groups that can never reach min_loc (every candidate merge would exceed
    max_loc or break per-file contiguity) are left as-is, so callers must
    tolerate residual undersized groups.
    """
    if settings.min_loc is None or len(grouped_units) < 2:
        return grouped_units

    repaired_groups = [list(group_units) for group_units in grouped_units]

    while True:
        # Greedy source ordering: smallest load first, then earliest anchor,
        # then index for determinism. With three or more undersized groups
        # competing for the same target this can miss a globally better merge
        # sequence; that is a known, accepted limitation of greedy repair.
        undersized_group_indices = sorted(
            (
                group_idx
                for group_idx, group_units in enumerate(repaired_groups)
                if _group_load(group_units) < settings.min_loc
            ),
            key=lambda group_idx: (
                _group_load(repaired_groups[group_idx]),
                _group_anchor_position(repaired_groups[group_idx]),
                group_idx,
            ),
        )

        merged = False
        for source_idx in undersized_group_indices:
            target_idx = _best_graph_merge_target(
                repaired_groups,
                source_idx,
                settings=settings,
            )
            if target_idx is None:
                continue
            repaired_groups = _merge_group_units(repaired_groups, source_idx, target_idx)
            merged = True
            break

        if not merged:
            return repaired_groups


def _group_units_cp_sat(
units: list[PartitionUnit], *, settings: Settings
) -> list[list[PartitionUnit]]:
Expand Down Expand Up @@ -392,6 +533,7 @@ def partition_diff(parsed_diff: ParsedDiff, settings: Settings) -> list[Group]:
match settings.partition_strategy:
case PartitionStrategy.GRAPH:
grouped_units = _group_units_graph(units, settings=settings)
grouped_units = _repair_graph_min_loc(grouped_units, settings=settings)
case PartitionStrategy.CP_SAT:
grouped_units = _group_units_cp_sat(units, settings=settings)
case _:
Expand Down
37 changes: 35 additions & 2 deletions tests/test_partitioning_extensive.py
Original file line number Diff line number Diff line change
def _settings(
    monkeypatch: pytest.MonkeyPatch,
    *,
    max_loc: int,
    min_loc: int | None = None,
    partition_strategy: PartitionStrategy,
    priority: Priority = Priority.ORTHOGONAL,
) -> Settings:
    """Build a Settings instance with LLM API keys scrubbed from the env."""
    for api_key_var in ("ANTHROPIC_API_KEY", "OPENAI_API_KEY"):
        monkeypatch.delenv(api_key_var, raising=False)
    return Settings(
        max_loc=max_loc,
        min_loc=min_loc,
        partition_strategy=partition_strategy,
        priority=priority,
    )
Expand All @@ -104,13 +106,32 @@ def _group_signature(groups: list[Group]) -> tuple[tuple[tuple[str, tuple[int, .
return tuple(normalized)


def _assert_valid_plan(
    groups: list[Group],
    diff_text: str,
    max_loc: int,
    min_loc: int | None = None,
) -> None:
    """Assert that the plan validates cleanly against the parsed diff."""
    parsed = parse_diff(diff_text)
    plan_dag = PlanDAG(groups)
    warnings = validate_plan(groups, parsed, plan_dag, max_loc, min_loc=min_loc)
    assert warnings == []


class TestPartitionDiffGraphExtensive:
def test_min_loc_merges_undersized_groups_when_possible(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
settings = _settings(
monkeypatch,
max_loc=10,
min_loc=5,
partition_strategy=PartitionStrategy.GRAPH,
priority=Priority.ORTHOGONAL,
)
groups = partition_diff(parse_diff(UNRELATED_DIFF), settings)
assert len(groups) == 1
_assert_valid_plan(groups, UNRELATED_DIFF, 10, min_loc=5)
Comment on lines +121 to +133
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 No test coverage for permanently-blocked repair pass

There is no test for the case where every possible merge for an undersized group is blocked — either because every pair would exceed max_loc, or because contiguity constraints rule out all candidates. In that scenario _repair_graph_min_loc returns groups that still violate min_loc, which is valid and expected behaviour, but it is currently untested.

A suggested fixture: two files each with 6 LOC (max_loc=10, min_loc=7). Each file forms a single group of 6 LOC (below min_loc=7), but merging them would yield 12 LOC which exceeds max_loc=10. The repair pass must leave both groups as-is. Adding this as a test (and asserting len(groups) == 2) would confirm the graceful degradation path.

Prompt To Fix With AI
This is a comment left during a code review.
Path: tests/test_partitioning_extensive.py
Line: 121-133

Comment:
**No test coverage for permanently-blocked repair pass**

There is no test for the case where every possible merge for an undersized group is blocked — either because every pair would exceed `max_loc`, or because contiguity constraints rule out all candidates. In that scenario `_repair_graph_min_loc` returns groups that still violate `min_loc`, which is valid and expected behaviour, but it is currently untested.

A suggested fixture: two files each with 6 LOC (`max_loc=10`, `min_loc=7`). Each file forms a single group of 6 LOC (below `min_loc=7`), but merging them would yield 12 LOC which exceeds `max_loc=10`. The repair pass must leave both groups as-is. Adding this as a test (and asserting `len(groups) == 2`) would confirm the graceful degradation path.

How can I resolve this? If you propose a fix, please make it concise.


def test_orthogonal_keeps_unrelated_files_separate(
self, monkeypatch: pytest.MonkeyPatch
) -> None:
Expand Down Expand Up @@ -163,6 +184,18 @@ def test_is_deterministic_across_runs(self, monkeypatch: pytest.MonkeyPatch) ->
signatures = {_group_signature(partition_diff(parsed, settings)) for _ in range(3)}
assert len(signatures) == 1

    def test_min_loc_repair_is_deterministic(self, monkeypatch: pytest.MonkeyPatch) -> None:
        """Repeated runs of min_loc repair produce an identical plan signature.

        NOTE(review): this reuses the same UNRELATED_DIFF / max_loc=10 /
        min_loc=5 scenario as the merge test above, which appears to have a
        single forced merge path — the tiebreaking branches of
        _best_graph_merge_target are presumably not exercised. TODO: switch to
        a fixture with three or more undersized groups and competing targets.
        """
        settings = _settings(
            monkeypatch,
            max_loc=10,
            min_loc=5,
            partition_strategy=PartitionStrategy.GRAPH,
            priority=Priority.ORTHOGONAL,
        )
        parsed = parse_diff(UNRELATED_DIFF)
        # Three runs must collapse to a single distinct signature.
        signatures = {_group_signature(partition_diff(parsed, settings)) for _ in range(3)}
        assert len(signatures) == 1
Comment on lines +187 to +197
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Duplicate test scenario reduces determinism-test value

test_min_loc_repair_is_deterministic uses the exact same UNRELATED_DIFF, max_loc=10, min_loc=5, and ORTHOGONAL priority as test_min_loc_merges_undersized_groups_when_possible. Because UNRELATED_DIFF produces exactly two groups with a single forced merge path, the determinism test doesn't exercise any branching in _best_graph_merge_target. A truly meaningful determinism test would use a diff with three or more groups so there are multiple valid merge candidates and the tiebreaking logic is actually exercised.

Consider replacing UNRELATED_DIFF here with a multi-group fixture, for example a diff with three files each adding 3 LOC (max_loc=10, min_loc=5), which would produce three undersized groups and have at least two possible merge paths to verify against.

Prompt To Fix With AI
This is a comment left during a code review.
Path: tests/test_partitioning_extensive.py
Line: 187-197

Comment:
**Duplicate test scenario reduces determinism-test value**

`test_min_loc_repair_is_deterministic` uses the exact same `UNRELATED_DIFF`, `max_loc=10`, `min_loc=5`, and `ORTHOGONAL` priority as `test_min_loc_merges_undersized_groups_when_possible`. Because `UNRELATED_DIFF` produces exactly two groups with a single forced merge path, the determinism test doesn't exercise any branching in `_best_graph_merge_target`. A truly meaningful determinism test would use a diff with three or more groups so there are multiple valid merge candidates and the tiebreaking logic is actually exercised.

Consider replacing `UNRELATED_DIFF` here with a multi-group fixture, for example a diff with three files each adding 3 LOC (`max_loc=10`, `min_loc=5`), which would produce three undersized groups and have at least two possible merge paths to verify against.

How can I resolve this? If you propose a fix, please make it concise.



class TestPartitionDiffCpSatExtensive:
def test_missing_ortools_raises_clean_error(self, monkeypatch: pytest.MonkeyPatch) -> None:
Expand Down
Loading