Skip to content

Commit

Permalink
Numpy-based RLE compression for bitmasks (#809)
Browse files Browse the repository at this point in the history
* Speed up bitmask operations
  • Loading branch information
sergei-encord authored Dec 2, 2024
1 parent 8a90571 commit 1bab699
Show file tree
Hide file tree
Showing 7 changed files with 255 additions and 125 deletions.
20 changes: 20 additions & 0 deletions encord/common/bitmask_operations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
try:
from encord.common.bitmask_operations.bitmask_operations_numpy import (
_mask_to_rle,
_rle_to_mask,
_rle_to_string,
_string_to_rle,
deserialise_bitmask,
serialise_bitmask,
transpose_bytearray,
)
except ImportError:
from encord.common.bitmask_operations.bitmask_operations import (
_mask_to_rle,
_rle_to_mask,
_rle_to_string,
_string_to_rle,
deserialise_bitmask,
serialise_bitmask,
transpose_bytearray,
)
104 changes: 104 additions & 0 deletions encord/common/bitmask_operations/bitmask_operations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from itertools import groupby
from typing import List, Sequence, Tuple


def _string_to_rle(mask_string: str) -> List[int]:
"""
COCO-compatible string to RLE-encoded mask de-serialisation
"""
cnts: List[int] = []
p = 0

while p < len(mask_string):
x = 0
k = 0
more = 1

while more and p < len(mask_string):
c = ord(mask_string[p]) - 48
x |= (c & 0x1F) << (5 * k)
more = c & 0x20
p += 1
k += 1

if not more and (c & 0x10):
x |= -1 << (5 * k)

if len(cnts) > 2:
x += cnts[-2]

cnts.append(x)

return cnts


def _rle_to_string(rle: Sequence[int]) -> str:
"""
COCO-compatible RLE-encoded mask to string serialisation
"""
rle_string = ""
for i, x in enumerate(rle):
if i > 2:
x -= rle[i - 2]

more = 1
while more:
c = x & 0x1F
x >>= 5

if c & 0x10:
more = x != -1
else:
more = x != 0

if more:
c |= 0x20

c += 48
rle_string += chr(c)

return rle_string


def _mask_to_rle(mask: bytes) -> List[int]:
"""
COCO-compatible raw bitmask to COCO-compatible RLE
"""
return [len(list(group)) for _, group in groupby(mask)]


def _rle_to_mask(rle: List[int], size: int) -> bytes:
"""
COCO-compatible RLE to bitmask
"""
res = bytearray(size)
offset = 0

for i, c in enumerate(rle):
v = i % 2
while c > 0:
res[offset] = v
offset += 1
c -= 1

return bytes(res)


def serialise_bitmask(bitmask: bytes) -> str:
    """Run-length encode a raw bitmask and return its string serialisation."""
    return _rle_to_string(_mask_to_rle(bitmask))


def deserialise_bitmask(serialised_bitmask: str, length: int) -> bytes:
    """Decode an RLE string back into a raw bitmask of `length` bytes."""
    run_lengths = _string_to_rle(serialised_bitmask)
    mask = _rle_to_mask(run_lengths, length)
    return mask


def transpose_bytearray(byte_data: bytes, shape: Tuple[int, int]) -> bytearray:
    """
    Transpose a row-major 2D array stored as bytes.

    Args:
        byte_data: the flattened array, one byte per cell.
        shape: `(rows, cols)` of the array held in `byte_data`.

    Returns:
        A new bytearray of `len(byte_data)` bytes holding the transposed array
        (row-major, `cols` x `rows`). Bytes beyond `rows * cols` stay 0.

    Raises:
        IndexError: if `byte_data` holds fewer than `rows * cols` bytes.
    """
    rows, cols = shape
    transposed_byte_data = bytearray(len(byte_data))
    if rows <= 0 or cols <= 0:
        return transposed_byte_data

    cell_count = rows * cols
    if len(byte_data) < cell_count:
        raise IndexError("byte_data is shorter than rows * cols")

    # Column `col` of the source is every `cols`-th byte starting at `col`;
    # it becomes row `col` of the result, copied with one slice assignment.
    for col in range(cols):
        transposed_byte_data[col * rows : (col + 1) * rows] = byte_data[col:cell_count:cols]

    return transposed_byte_data
32 changes: 32 additions & 0 deletions encord/common/bitmask_operations/bitmask_operations_numpy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from typing import List, Sequence, Tuple

import numpy as np

# Importing the pure-Python implementations of functions that do not have a numpy implementation
from .bitmask_operations import _rle_to_mask, _rle_to_string, _string_to_rle


def _mask_to_rle(mask: bytes) -> List[int]:
"""
COCO-compatible raw bitmask to COCO-compatible RLE
"""
mask_buffer = np.frombuffer(mask, dtype=np.bool_)
changes = np.diff(mask_buffer, prepend=mask_buffer[0], append=mask_buffer[-1])
change_indices = np.flatnonzero(changes != 0)
run_lengths = np.diff(np.concatenate(([0], change_indices, [len(mask_buffer)])))
return run_lengths.tolist()


def serialise_bitmask(bitmask: bytes) -> str:
    """Run-length encode a raw bitmask and return its string serialisation."""
    return _rle_to_string(_mask_to_rle(bitmask))


def deserialise_bitmask(serialised_bitmask: str, length: int) -> bytes:
    """Decode an RLE string back into a raw bitmask of `length` bytes."""
    run_lengths = _string_to_rle(serialised_bitmask)
    mask = _rle_to_mask(run_lengths, length)
    return mask


def transpose_bytearray(byte_data: bytes, shape: Tuple[int, int]) -> bytearray:
    """
    Transpose a row-major 2D array stored as bytes, using numpy.

    Args:
        byte_data: the flattened array, one byte per cell.
        shape: `(rows, cols)` of the array held in `byte_data`.

    Returns:
        A new bytearray holding the transposed array in row-major order.
    """
    np_byte_data = np.frombuffer(byte_data, dtype=np.int8).reshape(shape)
    # `.T` is only a view; `tobytes()` materialises it in the transposed order.
    return bytearray(np_byte_data.T.tobytes())
128 changes: 6 additions & 122 deletions encord/objects/bitmask.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@

from __future__ import annotations

from typing import Any, Dict, List, Optional, Protocol, Sequence, Tuple, Union, runtime_checkable
from typing import Any, Dict, Optional, Protocol, Union, runtime_checkable

from encord.common.bitmask_operations import deserialise_bitmask, serialise_bitmask
from encord.exceptions import EncordException
from encord.orm.base_dto import BaseDTO

Expand All @@ -31,123 +32,6 @@ def __array_interface__(self) -> Dict[str, Any]: ...
def tobytes(self) -> bytes: ...


def _string_to_rle(s: str) -> List[int]:
"""
COCO-compatible string to RLE-encoded mask de-serialisation
"""
cnts: List[int] = []
p = 0

while p < len(s):
x = 0
k = 0
more = 1

while more and p < len(s):
c = ord(s[p]) - 48
x |= (c & 0x1F) << (5 * k)
more = c & 0x20
p += 1
k += 1

if not more and (c & 0x10):
x |= -1 << (5 * k)

if len(cnts) > 2:
x += cnts[-2]

cnts.append(x)

return cnts


def _rle_to_string(rle: Sequence[int]) -> str:
"""
COCO-compatible RLE-encoded mask to string serialisation
"""
rle_string = ""
for i, x in enumerate(rle):
if i > 2:
x -= rle[i - 2]

more = 1
while more:
c = x & 0x1F
x >>= 5

if c & 0x10:
more = x != -1
else:
more = x != 0

if more:
c |= 0x20

c += 48
rle_string += chr(c)

return rle_string


def _rle_to_mask(rle: List[int], size: int) -> bytes:
"""
COCO-compatible RLE to bitmask
"""
res = bytearray(size)
offset = 0

for i, c in enumerate(rle):
v = i % 2
while c > 0:
res[offset] = v
offset += 1
c -= 1

return bytes(res)


def _mask_to_rle(mask: bytes) -> List[int]:
"""
COCO-compatible raw bitmask to COCO-compatible RLE
"""
rle_counts = []
c = 0
p = 0
for mask_value in mask:
if mask_value != p:
rle_counts.append(c)
c = 0
p = mask_value
c += 1

rle_counts.append(c)
return rle_counts


def transpose_bytearray(byte_data: bytes, shape: Tuple[int, int]) -> bytearray:
    """
    Transpose a 2D array represented by bytes.

    numpy is used when it can be imported; otherwise the cells are copied one
    by one in pure Python.
    """
    try:
        import numpy as np
    except ImportError:
        np = None

    if np is not None:
        matrix = np.frombuffer(byte_data, dtype=np.int8).reshape(shape)
        return bytearray(matrix.T.tobytes())

    # Pure-Python fallback: cell (r, c) of the source lands at (c, r) of the result.
    n_rows, n_cols = shape
    result = bytearray(len(byte_data))
    for r in range(n_rows):
        source_row_start = r * n_cols
        for c in range(n_cols):
            result[c * n_rows + r] = byte_data[source_row_start + c]

    return result


class BitmaskCoordinates:
class EncodedBitmask(BaseDTO):
top: int
Expand Down Expand Up @@ -212,8 +96,7 @@ def _from_array(source: ArrayProtocol) -> BitmaskCoordinates.EncodedBitmask:

raw_data = data if isinstance(data, bytes) else source.tobytes()

rle = _mask_to_rle(raw_data)
rle_string = _rle_to_string(rle)
rle_string = serialise_bitmask(raw_data)

return BitmaskCoordinates.EncodedBitmask(top=0, left=0, height=shape[0], width=shape[1], rle_string=rle_string)

Expand Down Expand Up @@ -241,8 +124,9 @@ def to_numpy_array(self):

@property
def __array_interface__(self):
rle = _string_to_rle(self._encoded_bitmask.rle_string)
data = _rle_to_mask(rle, self._encoded_bitmask.height * self._encoded_bitmask.width)
data = deserialise_bitmask(
self._encoded_bitmask.rle_string, self._encoded_bitmask.height * self._encoded_bitmask.width
)
return {
"version": 3,
"data": data,
Expand Down
2 changes: 1 addition & 1 deletion encord/utilities/coco/datastructure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from datetime import datetime
from typing import Any, List, NamedTuple, Optional, Union

from encord.objects.bitmask import (
from encord.common.bitmask_operations import (
_mask_to_rle,
_rle_to_mask,
_rle_to_string,
Expand Down
6 changes: 4 additions & 2 deletions tests/objects/test_bitmask.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import numpy as np

from encord.objects.bitmask import (
BitmaskCoordinates,
from encord.common.bitmask_operations import (
_mask_to_rle,
_rle_to_mask,
_rle_to_string,
_string_to_rle,
transpose_bytearray,
)
from encord.objects.bitmask import (
BitmaskCoordinates,
)


def test_rle_decode():
Expand Down
Loading

0 comments on commit 1bab699

Please sign in to comment.