Skip to content

Commit 57afd0c

Browse files
committed
Fix: handle missing/invalid values in CoordinateCompressor (fixes #13226)
1 parent c0d2b12 commit 57afd0c

File tree

3 files changed

+122
-81
lines changed

3 files changed

+122
-81
lines changed
Lines changed: 95 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,132 +1,146 @@
11
"""
2-
Assumption:
3-
- The values to compress are assumed to be comparable,
4-
values can be sorted and compared with '<' and '>' operators.
2+
Coordinate Compression Utility
3+
------------------------------
4+
5+
Fix for Issue #13226: Handles missing or invalid values (None, NaN)
6+
to ensure consistent compression behavior.
7+
8+
This module provides a `CoordinateCompressor` class that safely compresses
9+
and decompresses values from a list by mapping each unique valid value
10+
to a unique integer index.
11+
12+
Missing values (None and float NaN) are skipped when the mapping is built
13+
and return -1 when compressed; all other values must be mutually comparable.
514
"""
615

16+
from __future__ import annotations
17+
18+
import math
19+
from typing import Any
20+
721

822
class CoordinateCompressor:
    """
    Compress comparable values to dense integer ranks and back.

    Each distinct valid value in the input list is assigned the index it has
    in the sorted list of distinct valid values. Missing values (``None`` and
    float ``NaN``) are excluded from the mapping and compress to ``-1``.

    Instance attributes:
        original: A shallow copy of the input list, in input order.
        coordinate_map: Maps each distinct valid value to its rank.
        reverse_map: Sorted distinct valid values; index ``i`` holds the value
            whose rank is ``i``.
        invalid_values: The missing entries of the input, in input order.

    Example:
    >>> arr = [100, 10, 52, 83]
    >>> cc = CoordinateCompressor(arr)
    >>> cc.compress(100)
    3
    >>> cc.compress(52)
    1
    >>> cc.decompress(1)
    52
    >>> cc.compress(None)
    -1
    """

    @staticmethod
    def _is_missing(value: Any) -> bool:
        """
        Return True when ``value`` is ``None`` or a float ``NaN``.

        >>> CoordinateCompressor._is_missing(None)
        True
        >>> CoordinateCompressor._is_missing(float("nan"))
        True
        >>> CoordinateCompressor._is_missing(0)
        False
        """
        return value is None or (isinstance(value, float) and math.isnan(value))

    def __init__(self, arr: list[Any]) -> None:
        """
        Build the rank mappings for ``arr``.

        Args:
            arr: The list of values to be compressed. ``None`` and float
                ``NaN`` entries are skipped when building the mapping; the
                remaining entries must be hashable and mutually comparable.

        >>> arr = [100, None, 52, 83, float("nan")]
        >>> cc = CoordinateCompressor(arr)
        >>> cc.compress(100)
        2
        >>> cc.compress(None)
        -1
        >>> cc.compress(float("nan"))
        -1
        >>> cc.reverse_map
        [52, 83, 100]
        >>> len(cc.invalid_values)
        2
        """
        # Keep an independent copy so later mutation of the caller's list
        # does not affect this instance.
        self.original = list(arr)

        # Split the input into valid and missing entries in a single pass.
        valid_values: list[Any] = []
        self.invalid_values: list[Any] = []
        for value in self.original:
            if self._is_missing(value):
                self.invalid_values.append(value)
            else:
                valid_values.append(value)

        # dict.fromkeys drops duplicates while keeping first-seen order;
        # sorted() then puts the distinct values into rank order.
        unique_sorted = sorted(dict.fromkeys(valid_values))

        self.coordinate_map: dict[Any, int] = {
            value: rank for rank, value in enumerate(unique_sorted)
        }
        self.reverse_map: list[Any] = list(unique_sorted)

    def compress(self, original: Any) -> int:
        """
        Compress a single value to its coordinate index.

        Args:
            original: The value to compress.

        Returns:
            The compressed index, or -1 if the value is missing (``None`` or
            ``NaN``), unhashable, or was not present in the input list.

        >>> arr = [100, 10, 52, 83]
        >>> cc = CoordinateCompressor(arr)
        >>> cc.compress(10)
        0
        >>> cc.compress(7)
        -1
        >>> cc.compress(None)
        -1
        >>> cc.compress([10])
        -1
        """
        if self._is_missing(original):
            return -1
        try:
            return self.coordinate_map.get(original, -1)
        except TypeError:
            # An unhashable value can never be a key of the mapping, so it is
            # reported as "not found" instead of propagating the TypeError.
            return -1

    def decompress(self, num: int) -> Any:
        """
        Decompress an integer coordinate back to its original value.

        Args:
            num: Compressed index to decompress.

        Returns:
            The original value for valid indices, otherwise -1.

        >>> arr = [100, 10, 52, 83]
        >>> cc = CoordinateCompressor(arr)
        >>> cc.decompress(0)
        10
        >>> cc.decompress(5)
        -1
        >>> cc.decompress(-1)
        -1
        """
        # Negative indices are rejected explicitly so that -1 (the sentinel
        # returned by compress) does not wrap around to the last element.
        if 0 <= num < len(self.reverse_map):
            return self.reverse_map[num]
        return -1
119127

120128

121129
if __name__ == "__main__":
    from doctest import testmod

    # Run the doctests embedded in this module before the demo output.
    testmod()

    # Demo input: four valid values followed by the two kinds of missing value.
    arr: list[Any] = [100, 10, 52, 83, None, float("nan")]
    cc = CoordinateCompressor(arr)

    print("Coordinate Compression Demo:\n")
    for original in arr:
        compressed = cc.compress(original)
        decompressed = cc.decompress(compressed)
        # One row per input: the value, its rank, and the round-tripped value.
        line = (
            f"Original: {original!r:>6} | "
            f"Compressed: {compressed:>2} | "
            f"Decompressed: {decompressed!r}"
        )
        print(line)
146+

tests/data_compression/__init__.py

Whitespace-only changes.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from data_compression.coordinate_compression import CoordinateCompressor
2+
import math
3+
4+
5+
def test_basic_compression() -> None:
    """Ranks follow the sorted order of the distinct input values."""
    arr = [100, 10, 52, 83]
    cc = CoordinateCompressor(arr)
    assert cc.compress(10) == 0
    # The sorted input is [10, 52, 83, 100], so 83 has exactly rank 2; the
    # previous "== 2 or == 3" check would also have accepted a wrong rank.
    assert cc.compress(83) == 2
    assert cc.decompress(0) == 10
11+
12+
13+
def test_with_none_and_nan() -> None:
    """None and NaN compress to -1 and do not consume a rank."""
    arr = [100, None, 52, 83, float("nan")]
    cc = CoordinateCompressor(arr)
    assert cc.compress(None) == -1
    assert cc.compress(float("nan")) == -1
    # The valid values sort to [52, 83, 100], so 52 must have rank 0 exactly
    # rather than merely "some rank other than -1".
    assert cc.compress(52) == 0
    # Only three ranks exist (0..2): index 3 is the first out-of-range index,
    # which shows the two missing entries were not given a slot.
    assert cc.decompress(3) == -1
    assert cc.decompress(5) == -1
20+
21+
22+
def test_duplicate_values():
    """A list holding one repeated value maps that value to rank 0."""
    compressor = CoordinateCompressor([10, 10, 10])
    rank = compressor.compress(10)
    assert rank == 0
    restored = compressor.decompress(0)
    assert restored == 10
27+

0 commit comments

Comments
 (0)