fix: rely again on iso-8859-1 instead of utf8
The OpenGPT-X data seems to contain problematic characters that cannot be decoded via utf8.
Reverting to the former iso-8859-1 encoding fixes this. However, the issue probably lies in the dataset conversions themselves.
luzian-hahn committed Jan 23, 2024
1 parent ea2e2f9 commit c9e4e08
Showing 4 changed files with 17 additions and 8 deletions.
2 changes: 2 additions & 0 deletions src/modalities/constants.py
@@ -0,0 +1,2 @@
# Not relying on "utf8" after encountering encoding issues when using OpenGPT-X Data.
DEFAULT_ENCODING = "iso-8859-1"
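
For context, this is the decoding behavior the new constant works around: utf8 rejects stray high bytes, while iso-8859-1 maps every byte value 0x00–0xFF to a code point and therefore never raises. A minimal, self-contained illustration (the sample bytes are made up):

raw = b"Gr\xfc\xdfe"  # "Grüße" encoded as iso-8859-1; the lone 0xFC byte is not valid utf-8

try:
    raw.decode("utf8")
except UnicodeDecodeError as err:
    print(f"utf8 failed: {err}")

print(raw.decode("iso-8859-1"))  # -> "Grüße"; iso-8859-1 decoding cannot fail for any byte sequence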
17 changes: 11 additions & 6 deletions src/modalities/dataloader/create_index.py
@@ -9,13 +9,15 @@
import numpy as np
from tqdm import tqdm

from modalities.constants import DEFAULT_ENCODING


# TODO: benchmark against pyspark
class IndexGenerator:
def __init__(self, src_file: Path, chunksize: int = 4096, drop_faulty_entries: bool = False):
"""
Reads in a JSON file as a binary file, iterates character by character and builds up
the sample index (char-wisestart and end position for each JSON sample) via "\n" character positions.
the sample index (char-wise start and end position for each JSON sample) via "\n" character positions.
:param src_file: Path to a jsonl-file.
:param chunksize: defines the size of byte chunks that are processed via a producer-consumer approach.
@@ -59,16 +61,19 @@ def queue_generator():
def process_line(last_index: int, curr_index: int):
segment_len = curr_index - last_index
try: # check if line is a valid json
line = np.memmap(self.src_file, mode="r", offset=last_index, shape=(segment_len,)).view("S1").tolist()
line = [c.decode("utf8") for c in line]
line = "".join(line)
json.loads(line)
memmapped_line = (
np.memmap(self.src_file, mode="r", offset=last_index, shape=(segment_len,)).view("S1").tolist()
)
decoded_chars_in_line = [c.decode(DEFAULT_ENCODING) for c in memmapped_line]
decoded_line = "".join(decoded_chars_in_line)
json.loads(decoded_line)
self._index_map.append((last_index, segment_len))
except Exception as low_level_err:
if self.drop_faulty_entries:
warnings.warn(f"faulty line at {last_index}-{curr_index}, skipping...")
else:
warnings.warn(f"faulty line: {line=}")
concatenated_line_for_debugging = b"".join(memmapped_line)
warnings.warn(f"faulty line: {concatenated_line_for_debugging}")
err = ValueError(f"faulty line at {last_index}-{curr_index}")
err.__cause__ = low_level_err
self._exception_buffer.append(err)
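
As a side note, what the index generation boils down to can be sketched in a few lines: memory-map the jsonl file as raw bytes, split it at "\n" offsets, decode each segment with DEFAULT_ENCODING, and record (offset, length) for every segment that parses as JSON. This is a simplified single-pass sketch, not the chunked producer-consumer implementation above, and the helper name build_index is made up:

import json
import warnings
from pathlib import Path

import numpy as np

from modalities.constants import DEFAULT_ENCODING


def build_index(src_file: Path) -> list[tuple[int, int]]:
    # Map the whole jsonl file as raw bytes and split it at newline offsets.
    data = np.memmap(src_file, mode="r")
    index: list[tuple[int, int]] = []
    last = 0
    for pos in np.where(data == ord("\n"))[0]:
        pos = int(pos)
        line = bytes(data[last:pos]).decode(DEFAULT_ENCODING)
        try:
            json.loads(line)  # keep only segments that are valid JSON
            index.append((last, pos - last))
        except json.JSONDecodeError:
            warnings.warn(f"faulty line at {last}-{pos}, skipping...")
        last = pos + 1
    return index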
2 changes: 1 addition & 1 deletion src/modalities/dataloader/dataset.py
@@ -161,7 +161,7 @@ def generate_megatron_index(self) -> List[Tuple[int, int]]:
curr_len = 0
block_size_in_bytes = self.block_size * self.INT_SIZE_IN_BYTES
for segment_offset, segment_len in tqdm(self.index_base):
# When the sum of of the length of the current previously seen samples doesn't
# When the sum of the length of the current previously seen samples doesn't
# exceed block_size_in_bytes, we add the current segment length to the previous
# ones and continue.
if curr_len + segment_len < block_size_in_bytes:
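
The comment above describes a greedy accumulation: segment lengths are summed until a block of at least block_size_in_bytes is filled, then a new block starts. A rough, hypothetical sketch of just that accumulation step (pack_blocks is not part of the codebase, and the real generate_megatron_index handles more cases):

from typing import List, Tuple


def pack_blocks(index_base: List[Tuple[int, int]], block_size_in_bytes: int) -> List[Tuple[int, int]]:
    blocks: List[Tuple[int, int]] = []
    block_offset = None
    curr_len = 0
    for segment_offset, segment_len in index_base:
        if block_offset is None:
            block_offset = segment_offset
        if curr_len + segment_len < block_size_in_bytes:
            curr_len += segment_len  # block not full yet, keep accumulating
            continue
        blocks.append((block_offset, curr_len + segment_len))  # block filled, emit it
        block_offset, curr_len = None, 0
    return blocks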
4 changes: 3 additions & 1 deletion src/modalities/dataloader/large_file_lines_reader.py
@@ -6,6 +6,8 @@

import numpy as np

from modalities.constants import DEFAULT_ENCODING


class BaseReader(ABC):
@abstractmethod
@@ -60,7 +62,7 @@ def safe_decoder(byte_char):
try:
# TODO: verify why iso-8859-1 was necessary here in the past.
# Maybe there was an issue with the actual loading of the jsonl-files
c = byte_char.decode("utf8")
c = byte_char.decode(DEFAULT_ENCODING)
except Exception as exception:
c = ""
warnings.warn(f'Encountered invalid char: "{byte_char}".')
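
For reference, a standalone rendering of the changed helper with the same logic as the lines above. Since iso-8859-1 assigns a character to every byte value, the fallback branch should effectively never trigger after this change; it remains as a safety net:

import warnings

from modalities.constants import DEFAULT_ENCODING


def safe_decoder(byte_char: bytes) -> str:
    try:
        return byte_char.decode(DEFAULT_ENCODING)
    except Exception:
        # Practically unreachable with iso-8859-1, kept as a safety net.
        warnings.warn(f'Encountered invalid char: "{byte_char}".')
        return ""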
