From 7098c4f0dce0a290957f4e67de151539227821cc Mon Sep 17 00:00:00 2001 From: Dan Homola Date: Thu, 21 Mar 2024 21:34:19 +0100 Subject: [PATCH] Optimize the normal form detection (#123) * optimize normal_form The even_rows was always called with every_row_has_delim. This meant possibly two full scans of all the rows. By joining those two functions, we can save one of the scans. Also, the logic previously implemented by even_rows now exits early whenever possible (previous implementation used to scan the whole file no matter what). * further optimize normal_form Avoid unnecessary splitting and joining of the rows. The current implementation would split the file into rows in each of the is_form_x separately. They all would do it the same way. So instead, we can split the file once and pass the lines to the is_form_x directly. It also allows us to avoid "re-joining" of the lines in is_form_5 when it calls the is_form_2. The test_normal_forms test inputs were changed accordingly: they are split by the `\n` and the trailing newlines were manually removed (the actual code will always strip the trailing newlines before calling the is_form_x functions). --- clevercsv/normal_form.py | 69 ++++++++++------------- tests/test_unit/test_normal_forms.py | 84 ++++++++++++++-------------- 2 files changed, 74 insertions(+), 79 deletions(-) diff --git a/clevercsv/normal_form.py b/clevercsv/normal_form.py index c56b063..aff21fa 100644 --- a/clevercsv/normal_form.py +++ b/clevercsv/normal_form.py @@ -4,8 +4,8 @@ Detect the dialect with very strict functional tests. This module uses so-called "normal forms" to detect the dialect of CSV files. -Normal forms are detected with strict functional tests. The normal forms are -used as a pre-test to check if files are simple enough that computing the data +Normal forms are detected with strict functional tests. The normal forms are +used as a pre-test to check if files are simple enough that computing the data consistency measure is not necessary. Author: Gertjan van den Burg @@ -62,7 +62,7 @@ def detect_dialect_normal( return None form_and_dialect: List[ - Tuple[int, Callable[[str, SimpleDialect], bool], SimpleDialect] + Tuple[int, Callable[[List[str], SimpleDialect], bool], SimpleDialect] ] = [] for delim in delimiters: @@ -90,8 +90,10 @@ def detect_dialect_normal( ) ) + rows = split_file(data) + for ID, form_func, dialect in form_and_dialect: - if form_func(data, dialect): + if form_func(rows, dialect): if verbose: print("Matched normal form %i." % ID) return dialect @@ -169,6 +171,22 @@ def every_row_has_delim(rows: List[str], dialect: SimpleDialect) -> bool: return True +def every_row_has_delim_and_is_the_same_length( + rows: List[str], dialect: SimpleDialect +) -> bool: + assert dialect.delimiter is not None + if len(rows) == 0: + return False + + first_len = len(split_row(rows[0], dialect)) + for row in rows: + if not has_delimiter(row, dialect.delimiter): + return False + if len(split_row(row, dialect)) != first_len: + return False + return True + + def is_elementary(cell: str) -> bool: return ( regex.fullmatch(r"[a-zA-Z0-9\.\_\&\-\@\+\%\(\)\ \/]+", cell) @@ -176,13 +194,6 @@ def is_elementary(cell: str) -> bool: ) -def even_rows(rows: List[str], dialect: SimpleDialect) -> bool: - cells_per_row = set() - for row in rows: - cells_per_row.add(len(split_row(row, dialect))) - return len(cells_per_row) == 1 - - def split_file(data: str) -> List[str]: data = strip_trailing_crnl(data) if "\r\n" in data: @@ -219,16 +230,12 @@ def split_row(row: str, dialect: SimpleDialect) -> List[str]: return cells -def is_form_1(data: str, dialect: SimpleDialect) -> bool: +def is_form_1(rows: List[str], dialect: SimpleDialect) -> bool: # All cells quoted, quoted empty allowed, no nested quotes, more than one # column assert dialect.quotechar is not None - rows = split_file(data) - - if not every_row_has_delim(rows, dialect): - return False - if not even_rows(rows, dialect): + if not every_row_has_delim_and_is_the_same_length(rows, dialect): return False for row in rows: @@ -251,14 +258,9 @@ def is_form_1(data: str, dialect: SimpleDialect) -> bool: return True -def is_form_2(data: str, dialect: SimpleDialect) -> bool: +def is_form_2(rows: List[str], dialect: SimpleDialect) -> bool: # All unquoted, empty allowed, all elementary - - rows = split_file(data) - - if not every_row_has_delim(rows, dialect): - return False - if not even_rows(rows, dialect): + if not every_row_has_delim_and_is_the_same_length(rows, dialect): return False for row in rows: @@ -278,15 +280,11 @@ def is_form_2(data: str, dialect: SimpleDialect) -> bool: return True -def is_form_3(data: str, dialect: SimpleDialect) -> bool: +def is_form_3(rows: List[str], dialect: SimpleDialect) -> bool: # some quoted, some not quoted, no empty, no nested quotes assert dialect.quotechar is not None - rows = split_file(data) - - if not every_row_has_delim(rows, dialect): - return False - if not even_rows(rows, dialect): + if not every_row_has_delim_and_is_the_same_length(rows, dialect): return False if len(rows) <= 1: return False @@ -315,12 +313,10 @@ def is_form_3(data: str, dialect: SimpleDialect) -> bool: return True -def is_form_4(data: str, dialect: SimpleDialect) -> bool: +def is_form_4(rows: List[str], dialect: SimpleDialect) -> bool: # no delim, single column (either entirely quoted or entirely unquoted) assert dialect.quotechar is not None - rows = split_file(data) - if len(rows) <= 1: return False @@ -342,12 +338,9 @@ def is_form_4(data: str, dialect: SimpleDialect) -> bool: return True -def is_form_5(data: str, dialect: SimpleDialect) -> bool: +def is_form_5(rows: List[str], dialect: SimpleDialect) -> bool: # all rows quoted, no nested quotes # basically form 2 but with quotes around each row - - rows = split_file(data) - if not every_row_has_delim(rows, dialect): return False if len(rows) <= 1: @@ -365,4 +358,4 @@ def is_form_5(data: str, dialect: SimpleDialect) -> bool: for row in rows: newrows.append(row[1:-1]) - return is_form_2("\n".join(newrows), dialect) + return is_form_2(newrows, dialect) diff --git a/tests/test_unit/test_normal_forms.py b/tests/test_unit/test_normal_forms.py index 3e83480..db719f5 100644 --- a/tests/test_unit/test_normal_forms.py +++ b/tests/test_unit/test_normal_forms.py @@ -21,71 +21,73 @@ class NormalFormTestCase(unittest.TestCase): def test_form_1(self) -> None: dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="") - self.assertTrue(is_form_1('"A","B","C"', dialect)) - self.assertTrue(is_form_1('"A","B"\n"C","D"\n', dialect)) - self.assertTrue(is_form_1('"A","","C"', dialect)) + self.assertTrue(is_form_1('"A","B","C"'.split("\n"), dialect)) + self.assertTrue(is_form_1('"A","B"\n"C","D"'.split("\n"), dialect)) + self.assertTrue(is_form_1('"A","","C"'.split("\n"), dialect)) - self.assertFalse(is_form_1('"A","B"\n"A"', dialect)) - self.assertFalse(is_form_1('"A"\n"B"', dialect)) - self.assertFalse(is_form_1('"A"\n"A","B"', dialect)) - self.assertFalse(is_form_1('"A",,"C"', dialect)) - self.assertFalse(is_form_1('"A",C', dialect)) - self.assertFalse(is_form_1('"A"\n"b""A""c","B"', dialect)) + self.assertFalse(is_form_1('"A","B"\n"A"'.split("\n"), dialect)) + self.assertFalse(is_form_1('"A"\n"B"'.split("\n"), dialect)) + self.assertFalse(is_form_1('"A"\n"A","B"'.split("\n"), dialect)) + self.assertFalse(is_form_1('"A",,"C"'.split("\n"), dialect)) + self.assertFalse(is_form_1('"A",C'.split("\n"), dialect)) + self.assertFalse(is_form_1('"A"\n"b""A""c","B"'.split("\n"), dialect)) def test_form_2(self) -> None: dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="") - self.assertTrue(is_form_2("1,2,3", dialect)) - self.assertTrue(is_form_2("1,2,3\na,b,c\n", dialect)) - self.assertTrue(is_form_2("a@b.com,3", dialect)) - self.assertTrue(is_form_2("a,,3\n1,2,3", dialect)) + self.assertTrue(is_form_2("1,2,3".split("\n"), dialect)) + self.assertTrue(is_form_2("1,2,3\na,b,c".split("\n"), dialect)) + self.assertTrue(is_form_2("a@b.com,3".split("\n"), dialect)) + self.assertTrue(is_form_2("a,,3\n1,2,3".split("\n"), dialect)) - self.assertFalse(is_form_2("1,2,3\n1,2\n4,5,6", dialect)) - self.assertFalse(is_form_2("1", dialect)) - self.assertFalse(is_form_2('1,"a"', dialect)) - self.assertFalse(is_form_2("a;b,3", dialect)) - self.assertFalse(is_form_2('"a,3,3\n1,2,3', dialect)) - self.assertFalse(is_form_2('a,"",3\n1,2,3', dialect)) + self.assertFalse(is_form_2("1,2,3\n1,2\n4,5,6".split("\n"), dialect)) + self.assertFalse(is_form_2("1".split("\n"), dialect)) + self.assertFalse(is_form_2('1,"a"'.split("\n"), dialect)) + self.assertFalse(is_form_2("a;b,3".split("\n"), dialect)) + self.assertFalse(is_form_2('"a,3,3\n1,2,3'.split("\n"), dialect)) + self.assertFalse(is_form_2('a,"",3\n1,2,3'.split("\n"), dialect)) def test_form_3(self) -> None: A = SimpleDialect(delimiter=",", quotechar="'", escapechar="") Q = SimpleDialect(delimiter=",", quotechar='"', escapechar="") - self.assertTrue(is_form_3('A,B\nC,"D"', Q)) - self.assertTrue(is_form_3('A,B\nC,"d,e"', Q)) + self.assertTrue(is_form_3('A,B\nC,"D"'.split("\n"), Q)) + self.assertTrue(is_form_3('A,B\nC,"d,e"'.split("\n"), Q)) - self.assertFalse(is_form_3('A,\nC,"d,e"', Q)) - self.assertFalse(is_form_3("3;4,B\nC,D", Q)) + self.assertFalse(is_form_3('A,\nC,"d,e"'.split("\n"), Q)) + self.assertFalse(is_form_3("3;4,B\nC,D".split("\n"), Q)) - self.assertFalse(is_form_3('A,B\n"C",D\n', A)) - self.assertTrue(is_form_3('A,B\n"C",D\n', Q)) + self.assertFalse(is_form_3('A,B\n"C",D'.split("\n"), A)) + self.assertTrue(is_form_3('A,B\n"C",D'.split("\n"), Q)) def test_form_4(self) -> None: quoted = SimpleDialect(delimiter="", quotechar='"', escapechar="") unquoted = SimpleDialect(delimiter="", quotechar="", escapechar="") - self.assertTrue(is_form_4("A\nB\nC", unquoted)) - self.assertTrue(is_form_4("1\n2\n3", unquoted)) - self.assertTrue(is_form_4("A_B\n1\n2", unquoted)) - self.assertTrue(is_form_4("A&B\n1\n2", unquoted)) - self.assertTrue(is_form_4("A&B\n-1\n2", unquoted)) - self.assertTrue(is_form_4('"A"\n"B"\n"C"\n', quoted)) + self.assertTrue(is_form_4("A\nB\nC".split("\n"), unquoted)) + self.assertTrue(is_form_4("1\n2\n3".split("\n"), unquoted)) + self.assertTrue(is_form_4("A_B\n1\n2".split("\n"), unquoted)) + self.assertTrue(is_form_4("A&B\n1\n2".split("\n"), unquoted)) + self.assertTrue(is_form_4("A&B\n-1\n2".split("\n"), unquoted)) + self.assertTrue(is_form_4('"A"\n"B"\n"C"'.split("\n"), quoted)) - self.assertFalse(is_form_4('"A", "B"\n"B"\n"C"\n', quoted)) - self.assertFalse(is_form_4('"A","B"\n"B"\n"C"\n', quoted)) - self.assertFalse(is_form_4('"A@b"\n"B"\n"C"\n', quoted)) - self.assertFalse(is_form_4('A\n"-1"\n2', unquoted)) - self.assertFalse(is_form_4("A B\n-1 3\n2 4", unquoted)) + self.assertFalse(is_form_4('"A", "B"\n"B"\n"C"'.split("\n"), quoted)) + self.assertFalse(is_form_4('"A","B"\n"B"\n"C"'.split("\n"), quoted)) + self.assertFalse(is_form_4('"A@b"\n"B"\n"C"'.split("\n"), quoted)) + self.assertFalse(is_form_4('A\n"-1"\n2'.split("\n"), unquoted)) + self.assertFalse(is_form_4("A B\n-1 3\n2 4".split("\n"), unquoted)) def test_form_5(self) -> None: dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="") - self.assertTrue(is_form_5('"A,B"\n"1,2"\n"3,4"', dialect)) - self.assertTrue(is_form_5('"A,B"\n"1,"\n"2,3"', dialect)) + self.assertTrue(is_form_5('"A,B"\n"1,2"\n"3,4"'.split("\n"), dialect)) + self.assertTrue(is_form_5('"A,B"\n"1,"\n"2,3"'.split("\n"), dialect)) - self.assertFalse(is_form_5("A,B\n1,2\n3,4", dialect)) - self.assertFalse(is_form_5("A,B\n1,\n2,3", dialect)) - self.assertFalse(is_form_5('"A,""B"""\n"1,"\n"2,3"', dialect)) + self.assertFalse(is_form_5("A,B\n1,2\n3,4".split("\n"), dialect)) + self.assertFalse(is_form_5("A,B\n1,\n2,3".split("\n"), dialect)) + self.assertFalse( + is_form_5('"A,""B"""\n"1,"\n"2,3"'.split("\n"), dialect) + ) if __name__ == "__main__":