Skip to content

Commit 12566ee

Browse files
authored
Merge pull request #45 from olirice/or/full_join
Full outer join support
2 parents b264844 + bf8ab39 commit 12566ee

File tree

5 files changed

+96
-2
lines changed

5 files changed

+96
-2
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,3 +68,6 @@ venv/*
6868

6969
# Eclipse
7070
.settings
71+
72+
# LLMs
73+
CLAUDE.md

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ repos:
2828
language_version: python3.9
2929

3030
- repo: https://github.com/pre-commit/mirrors-mypy
31-
rev: v1.15.0
31+
rev: v1.17.0
3232
hooks:
3333
- id: mypy
3434
files: flupy/

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "flupy"
3-
version = "1.2.2"
3+
version = "1.2.3"
44
description = "Fluent data processing in Python - a chainable stream processing library for expressive data manipulation using method chaining"
55
authors = ["Oliver Rice <[email protected]>"]
66
license = "MIT"

src/flupy/fluent.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
Callable,
1111
Collection,
1212
Deque,
13+
Dict,
1314
Generator,
1415
Generic,
1516
Hashable,
@@ -314,6 +315,56 @@ def _impl() -> Generator[Tuple[T, _T1], None, None]:
314315

315316
return Fluent(_impl())
316317

318+
def join_full(
    self,
    other: Iterable[_T1],
    key: Callable[[T], Hashable] = identity,
    other_key: Callable[[_T1], Hashable] = identity,
) -> "Fluent[Tuple[Union[T, None], Union[_T1, None]]]":
    """Join the iterable with another iterable using equality between *key* applied to self and *other_key* applied to *other* to identify matching entries

    Returns all entries from both iterables. When no matching entry is found, entries are paired with None

    Entries from *self* are yielded first, in their original order, each paired with every
    matching entry from *other* (or with None). Entries from *other* whose key matched nothing
    in *self* follow as ``(None, entry)``, ordered by the first appearance of their key in *other*

    Note: join_full loads all of *other* into memory. *self* is consumed lazily, but the
    unmatched entries of *other* can only be yielded once *self* is exhausted

    >>> flu(range(4)).join_full(range(2, 6)).to_list()
    [(0, None), (1, None), (2, 2), (3, 3), (None, 4), (None, 5)]
    """

    def _impl() -> Generator[Tuple[Union[T, None], Union[_T1, None]], None, None]:
        # Group the entries of other by key. Dicts preserve insertion order, so the
        # keys remain in order of first appearance in other.
        other_lookup: Dict[Hashable, List[_T1]] = defaultdict(list)
        for entry_other in other:
            other_lookup[other_key(entry_other)].append(entry_other)

        # Keys of other that were paired with at least one entry from self
        matched_other_keys: Set[Hashable] = set()

        # Process all entries from self
        for entry in self:
            entry_key = key(entry)
            # .get() instead of indexing: a miss must not insert an empty group
            # into the defaultdict, or it would be visited in the final pass
            matches: Optional[List[_T1]] = other_lookup.get(entry_key)

            if matches:
                matched_other_keys.add(entry_key)
                for match in matches:
                    yield (entry, match)
            else:
                yield (entry, None)

        # Yield unmatched entries from other. Walking the dict (rather than a set
        # difference of keys) keeps the output order deterministic.
        for other_key_val, unmatched_entries in other_lookup.items():
            if other_key_val in matched_other_keys:
                continue
            for entry_other in unmatched_entries:
                yield (None, entry_other)

    return Fluent(_impl())
367+
317368
def shuffle(self) -> "Fluent[T]":
318369
"""Randomize the order of elements in the interable
319370

src/tests/test_flu.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,3 +387,43 @@ def test_join_inner():
387387
# Default unpacking
388388
res = flu(range(6)).join_inner(range(0, 6, 2)).collect()
389389
assert res == [(0, 0), (2, 2), (4, 4)]
390+
391+
392+
def test_join_full():
    """Check join_full on overlapping, keyed, empty and duplicate-key inputs"""

    def by_id(row):
        # Shared key extractor for the dict-based records on both sides
        return row["id"]

    def none_aware(pair):
        # None cannot be ordered against ints, so each slot is sorted by an
        # "is None" flag first and a numeric placeholder second
        lhs, rhs = pair
        return (
            lhs is None,
            -1 if lhs is None else lhs,
            rhs is None,
            -1 if rhs is None else rhs,
        )

    # Overlapping ranges with the default identity keys
    assert flu(range(4)).join_full(range(2, 6)).collect() == [
        (0, None),
        (1, None),
        (2, 2),
        (3, 3),
        (None, 4),
        (None, 5),
    ]

    # Records joined on a field through explicit key functions
    people = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}]
    amounts = [{"id": 2, "value": 100}, {"id": 3, "value": 200}]
    joined = flu(people).join_full(amounts, key=by_id, other_key=by_id).collect()
    assert joined == [
        ({"id": 1, "name": "Alice"}, None),
        ({"id": 2, "name": "Bob"}, {"id": 2, "value": 100}),
        (None, {"id": 3, "value": 200}),
    ]

    # Nothing on the left: every right entry is paired with None
    assert flu([]).join_full(range(3)).collect() == [(None, 0), (None, 1), (None, 2)]

    # Nothing on the right: every left entry is paired with None
    assert flu(range(3)).join_full([]).collect() == [(0, None), (1, None), (2, None)]

    # Nothing on either side
    assert flu([]).join_full([]).collect() == []

    # Repeated keys on both sides produce the 2x2 cartesian product for key 2
    actual = flu([1, 2, 2, 3]).join_full([2, 2, 4]).collect()
    wanted = [(1, None), (2, 2), (2, 2), (2, 2), (2, 2), (3, None), (None, 4)]
    # Compare order-insensitively
    assert sorted(actual, key=none_aware) == sorted(wanted, key=none_aware)

0 commit comments

Comments
 (0)