fix: rely again on iso-8859-1 instead of utf8
The OpenGPT-X data seems to contain problematic characters that cannot be decoded via utf8.
Reverting to the former iso-8859-1 encoding fixes this. However, the issue probably lies in the dataset conversions themselves.
luzian-hahn committed Jan 23, 2024
1 parent ea2e2f9 commit c9e4e08
Showing 4 changed files with 17 additions and 8 deletions.
2 changes: 2 additions & 0 deletions src/modalities/constants.py
@@ -0,0 +1,2 @@
# Not relying on "utf8" after encountering encoding issues when using OpenGPT-X Data.
DEFAULT_ENCODING = "iso-8859-1"
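
For context, this is the decoding behavior the new constant works around: utf8 rejects stray high bytes, while iso-8859-1 maps every byte value 0x00–0xFF to a code point and therefore never raises. A minimal, self-contained illustration (the sample bytes are made up):

raw = b"Gr\xfc\xdfe"  # "Grüße" encoded as iso-8859-1; the lone 0xFC byte is not valid utf-8

try:
    raw.decode("utf8")
except UnicodeDecodeError as err:
    print(f"utf8 failed: {err}")

print(raw.decode("iso-8859-1"))  # -> "Grüße"; iso-8859-1 decoding cannot fail for any byte sequence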
17 changes: 11 additions & 6 deletions src/modalities/dataloader/create_index.py
@@ -9,13 +9,15 @@
import numpy as np
from tqdm import tqdm

from modalities.constants import DEFAULT_ENCODING


# TODO: benchmark against pyspark
class IndexGenerator:
def __init__(self, src_file: Path, chunksize: int = 4096, drop_faulty_entries: bool = False):
"""
Reads in a JSON file as a binary file, iterates character by character and builds up
the sample index (char-wisestart and end position for each JSON sample) via "\n" character positions.
the sample index (char-wise start and end position for each JSON sample) via "\n" character positions.
:param src_file: Path to a jsonl-file.
:param chunksize: defines the size of byte chunks that are processed via a producer-consumer approach.
@@ -59,16 +61,19 @@ def queue_generator():
def process_line(last_index: int, curr_index: int):
segment_len = curr_index - last_index
try: # check if line is a valid json
line = np.memmap(self.src_file, mode="r", offset=last_index, shape=(segment_len,)).view("S1").tolist()
line = [c.decode("utf8") for c in line]
line = "".join(line)
json.loads(line)
memmapped_line = (
np.memmap(self.src_file, mode="r", offset=last_index, shape=(segment_len,)).view("S1").tolist()
)
decoded_chars_in_line = [c.decode(DEFAULT_ENCODING) for c in memmapped_line]
decoded_line = "".join(decoded_chars_in_line)
json.loads(decoded_line)
self._index_map.append((last_index, segment_len))
except Exception as low_level_err:
if self.drop_faulty_entries:
warnings.warn(f"faulty line at {last_index}-{curr_index}, skipping...")
else:
warnings.warn(f"faulty line: {line=}")
concatenated_line_for_debugging = b"".join(memmapped_line)
warnings.warn(f"faulty line: {concatenated_line_for_debugging}")
err = ValueError(f"faulty line at {last_index}-{curr_index}")
err.__cause__ = low_level_err
self._exception_buffer.append(err)
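
As a side note, what the index generation boils down to can be sketched in a few lines: memory-map the jsonl file as raw bytes, split it at "\n" offsets, decode each segment with DEFAULT_ENCODING, and record (offset, length) for every segment that parses as JSON. This is a simplified single-pass sketch, not the chunked producer-consumer implementation above, and the helper name build_index is made up:

import json
import warnings
from pathlib import Path

import numpy as np

from modalities.constants import DEFAULT_ENCODING


def build_index(src_file: Path) -> list[tuple[int, int]]:
    # Map the whole jsonl file as raw bytes and split it at newline offsets.
    data = np.memmap(src_file, mode="r")
    index: list[tuple[int, int]] = []
    last = 0
    for pos in np.where(data == ord("\n"))[0]:
        pos = int(pos)
        line = bytes(data[last:pos]).decode(DEFAULT_ENCODING)
        try:
            json.loads(line)  # keep only segments that are valid JSON
            index.append((last, pos - last))
        except json.JSONDecodeError:
            warnings.warn(f"faulty line at {last}-{pos}, skipping...")
        last = pos + 1
    return index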
2 changes: 1 addition & 1 deletion src/modalities/dataloader/dataset.py
@@ -161,7 +161,7 @@ def generate_megatron_index(self) -> List[Tuple[int, int]]:
curr_len = 0
block_size_in_bytes = self.block_size * self.INT_SIZE_IN_BYTES
for segment_offset, segment_len in tqdm(self.index_base):
# When the sum of of the length of the current previously seen samples doesn't
# When the sum of the length of the current previously seen samples doesn't
# exceed block_size_in_bytes, we add the current segment length to the previous
# ones and continue.
if curr_len + segment_len < block_size_in_bytes:
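
The comment above describes a greedy accumulation: segment lengths are summed until a block of at least block_size_in_bytes is filled, then a new block starts. A rough, hypothetical sketch of just that accumulation step (pack_blocks is not part of the codebase, and the real generate_megatron_index handles more cases):

from typing import List, Tuple


def pack_blocks(index_base: List[Tuple[int, int]], block_size_in_bytes: int) -> List[Tuple[int, int]]:
    blocks: List[Tuple[int, int]] = []
    block_offset = None
    curr_len = 0
    for segment_offset, segment_len in index_base:
        if block_offset is None:
            block_offset = segment_offset
        if curr_len + segment_len < block_size_in_bytes:
            curr_len += segment_len  # block not full yet, keep accumulating
            continue
        blocks.append((block_offset, curr_len + segment_len))  # block filled, emit it
        block_offset, curr_len = None, 0
    return blocks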
4 changes: 3 additions & 1 deletion src/modalities/dataloader/large_file_lines_reader.py
@@ -6,6 +6,8 @@

import numpy as np

from modalities.constants import DEFAULT_ENCODING


class BaseReader(ABC):
@abstractmethod
@@ -60,7 +62,7 @@ def safe_decoder(byte_char):
try:
# TODO: verify why iso-8859-1 was necessary here in the past.
# Maybe there was an issue with the actual loading of the jsonl-files
c = byte_char.decode("utf8")
c = byte_char.decode(DEFAULT_ENCODING)
except Exception as exception:
c = ""
warnings.warn(f'Encountered invalid char: "{byte_char}".')
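
For reference, a standalone rendering of the changed helper with the same logic as the lines above. Since iso-8859-1 assigns a character to every byte value, the fallback branch should effectively never trigger after this change; it remains as a safety net:

import warnings

from modalities.constants import DEFAULT_ENCODING


def safe_decoder(byte_char: bytes) -> str:
    try:
        return byte_char.decode(DEFAULT_ENCODING)
    except Exception:
        # Practically unreachable with iso-8859-1, kept as a safety net.
        warnings.warn(f'Encountered invalid char: "{byte_char}".')
        return ""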
