Merge Sections Spanning Multiple Lines in CSV Reader (#88)

Hochfrequenz · May 4, 2022 · 64805e9 · 64805e9
1 parent fa932ed
commit 64805e9
Show file tree

Hide file tree

Showing 3 changed files with 114 additions and 7 deletions.
diff --git a/src/maus/reader/flat_ahb_reader.py b/src/maus/reader/flat_ahb_reader.py
@@ -7,7 +7,7 @@
 import uuid
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import List, Optional, Sequence, TextIO, Tuple
+from typing import List, Optional, Sequence, Set, TextIO, Tuple
 
 from maus.models.anwendungshandbuch import AhbLine, AhbMetaInformation, FlatAnwendungshandbuch
 from maus.models.edifact_components import gabi_edifact_qualifier_pattern
@@ -47,11 +47,77 @@ def __init__(self, file_path: Path, pruefidentifikator: Optional[str] = None, en
         self.delimiter = delimiter
         with open(file_path, "r", encoding=encoding) as infile:
             # current_section_name: Optional[str]
-            for row in self.get_raw_rows(infile):
-                ahb_line = self.raw_ahb_row_to_ahbline(row)
-                if ahb_line is None:
-                    continue
-                self.rows.append(ahb_line)
+            raw_lines = self.get_raw_rows(infile)
+        raw_lines_with_merged_section_names = FlatAhbCsvReader.merge_section_only_lines(raw_lines)
+        for row in raw_lines_with_merged_section_names:
+            ahb_line = self.raw_ahb_row_to_ahbline(row)
+            if ahb_line is None:
+                continue
+            self.rows.append(ahb_line)
+
+    @staticmethod
+    def merge_section_only_lines(raw_lines: List[dict]) -> List[dict]:
+        """
+        merges adjacent lines from the CSV source when they only contain an AHB "section" description.
+        "Section" headings are the grey lines on the left of the AHB PDF.
+        (The first section of each AHB is "Nachrichten-Kopfsegment" in most cases.)
+        When the section heading spans multiple lines, we don't want to treat them as separate but as a single heading.
+        The method consumes a list of dicts and returns a _new_ list of dicts that is of the same length or shorter.
+        """
+        result: List[dict] = []
+
+        # imagine the original list to be
+        # 0,asd,qwertz,
+        # 1,a very long section,
+        # 2,heading that spans,
+        # 3,multiple lines,
+        # 4,Foo,Bar,Y
+        # 5,Baz,Boom,Z
+        # we then want to merge the lines with index 1-3 into a single line
+        keys_that_must_no_hold_any_values: Set[str] = {
+            "Segment",
+            "Datenelement",
+            "Codes und Qualifier",
+            "Beschreibung",
+            "Bedingung",
+        }
+
+        def line_only_contains_segment_gruppe(raw_line: dict) -> bool:
+            """
+            returns true if the given raw line only contains some meaningful data in the "Segment Gruppe" key
+            """
+            for row_key in keys_that_must_no_hold_any_values:
+                if row_key in raw_line and raw_line[row_key] is not None and len(raw_line[row_key].strip()) > 0:
+                    return False
+            return True
+
+        merged_section_name = ""
+        number_of_lines_merged = 0
+        for raw_line in raw_lines:
+            if (
+                "Segment Gruppe" in raw_line
+                and raw_line["Segment Gruppe"]
+                and line_only_contains_segment_gruppe(raw_line)
+                and not raw_line["Segment Gruppe"].startswith("SG")
+            ):
+                merged_section_name += " " + raw_line["Segment Gruppe"]
+                number_of_lines_merged += 1
+            else:
+                # note that AHBs never end with a section heading, so all headings/sections will run into this block
+                if len(merged_section_name) > 0:
+                    artificial_merged_line: dict = {
+                        "": str(int(raw_line[""]) - 1),
+                        "Segment Gruppe": merged_section_name.strip().replace("  ", " "),
+                    }
+                    for key in keys_that_must_no_hold_any_values:
+                        # although we know there's no meaningful value here, we still need the keys with empty values
+                        # so that the line looks legit to downstream code ➡ we re-add them.
+                        artificial_merged_line[key] = ""
+                    result.append(artificial_merged_line)
+                    merged_section_name = ""
+                    number_of_lines_merged = 0
+                result.append(raw_line)
+        return result
 
     def get_raw_rows(self, file_handle: TextIO) -> List[dict]:
         """

diff --git a/unit_tests/ahbs/FV2204/UTILMD/11042_deep.json b/unit_tests/ahbs/FV2204/UTILMD/11042_deep.json
@@ -462,7 +462,7 @@
                     }
                   ],
                   "discriminator": "NAD",
-                  "section_name": "Messstellenbetreibers"
+                  "section_name": "Korrespondenzanschrift des Kunden des Messstellenbetreibers"
                 },
                 {
                   "ahb_expression": "Muss",

diff --git a/unit_tests/test_ahb_csv_reader.py b/unit_tests/test_ahb_csv_reader.py
@@ -179,6 +179,16 @@ def test_csv_file_reading_11042(self, datafiles):
         path_to_csv: Path = datafiles / "11042.csv"
         reader = FlatAhbCsvReader(file_path=path_to_csv)
         assert len(reader.rows) == 846
+        assert (
+            len(
+                [
+                    r
+                    for r in reader.rows
+                    if r.section_name == "Korrespondenzanschrift des Kunden des Messstellenbetreibers"
+                ]
+            )
+            > 0
+        )  # this shows that the merging of sections spanning multiple lines works, see original CSV
         # first row assertions
         first_row = reader.rows[0]
         assert first_row.segment_code == "UNH"
@@ -194,3 +204,34 @@ def test_csv_file_reading_11042(self, datafiles):
         flat_ahb = reader.to_flat_ahb()
         assert len(flat_ahb.lines) < len(reader.rows)  # filter out the empty lines
         assert flat_ahb.get_segment_groups() == [None, "SG2", "SG3", "SG4", "SG5", "SG6", "SG8", "SG9", "SG10", "SG12"]
+
+    @pytest.mark.parametrize(
+        "input_lines,expected_lines",
+        [
+            pytest.param(
+                [
+                    {"": "0", "ASD": "asd"},
+                    {"": "1", "Segment Gruppe": "Das ist der Anfang"},
+                    {"": "2", "Segment Gruppe": "einer sehr langen"},
+                    {"": "3", "Segment Gruppe": "Geschichte."},
+                    {"": "4", "Foo": "Bar"},
+                ],
+                [
+                    {"": "0", "ASD": "asd"},
+                    {
+                        "": "3",
+                        "Segment Gruppe": "Das ist der Anfang einer sehr langen Geschichte.",
+                        "Bedingung": "",
+                        "Beschreibung": "",
+                        "Codes und Qualifier": "",
+                        "Datenelement": "",
+                        "Segment": "",
+                    },
+                    {"": "4", "Foo": "Bar"},
+                ],
+            )
+        ],
+    )
+    def test_merging_of_section_only_lines(self, input_lines: List[dict], expected_lines: List[dict]):
+        actual = FlatAhbCsvReader.merge_section_only_lines(input_lines)
+        assert actual == expected_lines