Skip to content

Commit

Permalink
Do not try integer packing for huge values
Browse files Browse the repository at this point in the history
  • Loading branch information
padix-key committed Oct 23, 2024
1 parent 53256ed commit c039c3e
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 1 deletion.
31 changes: 31 additions & 0 deletions src/biotite/structure/io/pdbx/compress.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,14 @@ def _find_best_integer_compression(array):
array_after_rle = array_after_delta
for packed_byte_count in [None, 1, 2]:
if packed_byte_count is not None:
# Quickly check this heuristic
# to avoid computing an exploding packed data array
if (
_estimate_packed_length(array_after_rle, packed_byte_count)
>= array_after_rle.nbytes
):
# Packing would not reduce the size
continue
encoding = IntegerPackingEncoding(packed_byte_count)
array_after_packing = encoding.encode(array_after_rle)
encodings_after_packing = encodings_after_rle + [encoding]
Expand All @@ -194,6 +202,29 @@ def _find_best_integer_compression(array):
return best_encoding_sequence, smallest_size


def _estimate_packed_length(array, packed_byte_count):
"""
Estimate the length of an integer array after packing it with a given number of
bytes.
Parameters
----------
array : numpy.ndarray
The array to pack.
packed_byte_count : int
The number of bytes used for packing.
Returns
-------
length : int
The estimated length of the packed array.
"""
# Use int64 to avoid integer overflow in the following line
max_val_per_element = np.int64(2 ** (8 * packed_byte_count))
n_bytes_per_element = packed_byte_count * (np.abs(array // max_val_per_element) + 1)
return np.sum(n_bytes_per_element, dtype=np.int64)


def _to_smallest_integer_type(array):
"""
Convert an integer array to the smallest possible integer type, that is still able
Expand Down
2 changes: 1 addition & 1 deletion src/biotite/structure/io/pdbx/encoding.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,7 @@ class IntegerPackingEncoding(Encoding):
# Get length of output array
# by summing up required length of each element
cdef int number
cdef int length = 0
cdef long length = 0
for i in range(data.shape[0]):
number = data[i]
if number < 0:
Expand Down

0 comments on commit c039c3e

Please sign in to comment.