Skip to content

Commit

Permalink
Merge pull request #683 from padix-key/compress
Browse files Browse the repository at this point in the history
Do not try integer packing for huge values
  • Loading branch information
padix-key authored Oct 25, 2024
2 parents 53256ed + 1edfc0c commit 54cd5e5
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 3 deletions.
8 changes: 6 additions & 2 deletions src/biotite/application/viennarna/rnaplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,12 @@ def run(self):
self._in_file.write(self._dot_bracket)
self._in_file.flush()
self.set_arguments(
["-i", self._in_file.name, "-o", "xrna", "-t", self._layout_type]
)
[
"-i", self._in_file.name,
"--output-format", "xrna",
"-t", self._layout_type,
]
) # fmt: skip
super().run()

def evaluate(self):
Expand Down
31 changes: 31 additions & 0 deletions src/biotite/structure/io/pdbx/compress.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,14 @@ def _find_best_integer_compression(array):
array_after_rle = array_after_delta
for packed_byte_count in [None, 1, 2]:
if packed_byte_count is not None:
# Quickly check this heuristic
# to avoid computing an exploding packed data array
if (
_estimate_packed_length(array_after_rle, packed_byte_count)
>= array_after_rle.nbytes
):
# Packing would not reduce the size
continue
encoding = IntegerPackingEncoding(packed_byte_count)
array_after_packing = encoding.encode(array_after_rle)
encodings_after_packing = encodings_after_rle + [encoding]
Expand All @@ -194,6 +202,29 @@ def _find_best_integer_compression(array):
return best_encoding_sequence, smallest_size


def _estimate_packed_length(array, packed_byte_count):
    """
    Estimate the size in bytes of an integer array after packing it with
    a given number of bytes per packed element.

    This is a cheap heuristic: each packed element is assumed to hold
    ``2 ** (8 * packed_byte_count)`` units of magnitude, and a value is
    assumed to need one element plus one more for each time that capacity
    fits into its magnitude.
    No packed array is actually built.

    Parameters
    ----------
    array : numpy.ndarray
        The integer array to pack.
    packed_byte_count : int
        The number of bytes used for each packed element.

    Returns
    -------
    length : int
        The estimated size of the packed array in bytes.
    """
    array = np.asarray(array)
    # Use int64 to avoid integer overflow in the following lines
    max_val_per_element = np.int64(2 ** (8 * packed_byte_count))
    # Derive the magnitude *before* dividing: floor division rounds negative
    # quotients away from zero, so e.g. ``-1 // 256 == -1`` would charge
    # every negative value one element too many.
    # ``~x == -x - 1`` for negative ``x``, which (unlike ``np.abs``) cannot
    # overflow at the minimum value of the dtype.
    # NOTE(review): presumably the actual encoder also needs only a single
    # element for small negative values - confirm against
    # ``IntegerPackingEncoding``.
    magnitude = np.where(array < 0, ~array, array)
    n_bytes_per_element = packed_byte_count * (
        magnitude // max_val_per_element + 1
    )
    return np.sum(n_bytes_per_element, dtype=np.int64)


def _to_smallest_integer_type(array):
"""
Convert an integer array to the smallest possible integer type, that is still able
Expand Down
2 changes: 1 addition & 1 deletion src/biotite/structure/io/pdbx/encoding.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,7 @@ class IntegerPackingEncoding(Encoding):
# Get length of output array
# by summing up required length of each element
cdef int number
cdef int length = 0
cdef long length = 0
for i in range(data.shape[0]):
number = data[i]
if number < 0:
Expand Down

0 comments on commit 54cd5e5

Please sign in to comment.