Skip to content

Commit

Permalink
Optimize the normal form detection (#123)
Browse files Browse the repository at this point in the history
* optimize normal_form

The even_rows was always called with every_row_has_delim. This meant possibly two full scans of all the rows.

By joining those two functions, we can save one of the scans. Also, the logic previously implemented by even_rows now exits early whenever possible (previous implementation used to scan the whole file no matter what).

* further optimize normal_form

Avoid unnecessary splitting and joining of the rows. The current implementation would split the file into rows in each of the is_form_x separately. They all would do it the same way.

So instead, we can split the file once and pass the lines to the is_form_x directly. It also allows us to avoid "re-joining" of the lines in is_form_5 when it calls the is_form_2.

The test_normal_forms test inputs were changed accordingly: they are split by the `\n` and the trailing newlines were manually removed (the actual code will always strip the trailing newlines before calling the is_form_x functions).
  • Loading branch information
no23reason authored Mar 21, 2024
1 parent 4a7a270 commit 7098c4f
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 79 deletions.
69 changes: 31 additions & 38 deletions clevercsv/normal_form.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
Detect the dialect with very strict functional tests.
This module uses so-called "normal forms" to detect the dialect of CSV files.
Normal forms are detected with strict functional tests. The normal forms are
used as a pre-test to check if files are simple enough that computing the data
Normal forms are detected with strict functional tests. The normal forms are
used as a pre-test to check if files are simple enough that computing the data
consistency measure is not necessary.
Author: Gertjan van den Burg
Expand Down Expand Up @@ -62,7 +62,7 @@ def detect_dialect_normal(
return None

form_and_dialect: List[
Tuple[int, Callable[[str, SimpleDialect], bool], SimpleDialect]
Tuple[int, Callable[[List[str], SimpleDialect], bool], SimpleDialect]
] = []

for delim in delimiters:
Expand Down Expand Up @@ -90,8 +90,10 @@ def detect_dialect_normal(
)
)

rows = split_file(data)

for ID, form_func, dialect in form_and_dialect:
if form_func(data, dialect):
if form_func(rows, dialect):
if verbose:
print("Matched normal form %i." % ID)
return dialect
Expand Down Expand Up @@ -169,20 +171,29 @@ def every_row_has_delim(rows: List[str], dialect: SimpleDialect) -> bool:
return True


def every_row_has_delim_and_is_the_same_length(
rows: List[str], dialect: SimpleDialect
) -> bool:
assert dialect.delimiter is not None
if len(rows) == 0:
return False

first_len = len(split_row(rows[0], dialect))
for row in rows:
if not has_delimiter(row, dialect.delimiter):
return False
if len(split_row(row, dialect)) != first_len:
return False
return True


def is_elementary(cell: str) -> bool:
return (
regex.fullmatch(r"[a-zA-Z0-9\.\_\&\-\@\+\%\(\)\ \/]+", cell)
is not None
)


def even_rows(rows: List[str], dialect: SimpleDialect) -> bool:
cells_per_row = set()
for row in rows:
cells_per_row.add(len(split_row(row, dialect)))
return len(cells_per_row) == 1


def split_file(data: str) -> List[str]:
data = strip_trailing_crnl(data)
if "\r\n" in data:
Expand Down Expand Up @@ -219,16 +230,12 @@ def split_row(row: str, dialect: SimpleDialect) -> List[str]:
return cells


def is_form_1(data: str, dialect: SimpleDialect) -> bool:
def is_form_1(rows: List[str], dialect: SimpleDialect) -> bool:
# All cells quoted, quoted empty allowed, no nested quotes, more than one
# column
assert dialect.quotechar is not None

rows = split_file(data)

if not every_row_has_delim(rows, dialect):
return False
if not even_rows(rows, dialect):
if not every_row_has_delim_and_is_the_same_length(rows, dialect):
return False

for row in rows:
Expand All @@ -251,14 +258,9 @@ def is_form_1(data: str, dialect: SimpleDialect) -> bool:
return True


def is_form_2(data: str, dialect: SimpleDialect) -> bool:
def is_form_2(rows: List[str], dialect: SimpleDialect) -> bool:
# All unquoted, empty allowed, all elementary

rows = split_file(data)

if not every_row_has_delim(rows, dialect):
return False
if not even_rows(rows, dialect):
if not every_row_has_delim_and_is_the_same_length(rows, dialect):
return False

for row in rows:
Expand All @@ -278,15 +280,11 @@ def is_form_2(data: str, dialect: SimpleDialect) -> bool:
return True


def is_form_3(data: str, dialect: SimpleDialect) -> bool:
def is_form_3(rows: List[str], dialect: SimpleDialect) -> bool:
# some quoted, some not quoted, no empty, no nested quotes
assert dialect.quotechar is not None

rows = split_file(data)

if not every_row_has_delim(rows, dialect):
return False
if not even_rows(rows, dialect):
if not every_row_has_delim_and_is_the_same_length(rows, dialect):
return False
if len(rows) <= 1:
return False
Expand Down Expand Up @@ -315,12 +313,10 @@ def is_form_3(data: str, dialect: SimpleDialect) -> bool:
return True


def is_form_4(data: str, dialect: SimpleDialect) -> bool:
def is_form_4(rows: List[str], dialect: SimpleDialect) -> bool:
# no delim, single column (either entirely quoted or entirely unquoted)
assert dialect.quotechar is not None

rows = split_file(data)

if len(rows) <= 1:
return False

Expand All @@ -342,12 +338,9 @@ def is_form_4(data: str, dialect: SimpleDialect) -> bool:
return True


def is_form_5(data: str, dialect: SimpleDialect) -> bool:
def is_form_5(rows: List[str], dialect: SimpleDialect) -> bool:
# all rows quoted, no nested quotes
# basically form 2 but with quotes around each row

rows = split_file(data)

if not every_row_has_delim(rows, dialect):
return False
if len(rows) <= 1:
Expand All @@ -365,4 +358,4 @@ def is_form_5(data: str, dialect: SimpleDialect) -> bool:
for row in rows:
newrows.append(row[1:-1])

return is_form_2("\n".join(newrows), dialect)
return is_form_2(newrows, dialect)
84 changes: 43 additions & 41 deletions tests/test_unit/test_normal_forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,71 +21,73 @@ class NormalFormTestCase(unittest.TestCase):
def test_form_1(self) -> None:
dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")

self.assertTrue(is_form_1('"A","B","C"', dialect))
self.assertTrue(is_form_1('"A","B"\n"C","D"\n', dialect))
self.assertTrue(is_form_1('"A","","C"', dialect))
self.assertTrue(is_form_1('"A","B","C"'.split("\n"), dialect))
self.assertTrue(is_form_1('"A","B"\n"C","D"'.split("\n"), dialect))
self.assertTrue(is_form_1('"A","","C"'.split("\n"), dialect))

self.assertFalse(is_form_1('"A","B"\n"A"', dialect))
self.assertFalse(is_form_1('"A"\n"B"', dialect))
self.assertFalse(is_form_1('"A"\n"A","B"', dialect))
self.assertFalse(is_form_1('"A",,"C"', dialect))
self.assertFalse(is_form_1('"A",C', dialect))
self.assertFalse(is_form_1('"A"\n"b""A""c","B"', dialect))
self.assertFalse(is_form_1('"A","B"\n"A"'.split("\n"), dialect))
self.assertFalse(is_form_1('"A"\n"B"'.split("\n"), dialect))
self.assertFalse(is_form_1('"A"\n"A","B"'.split("\n"), dialect))
self.assertFalse(is_form_1('"A",,"C"'.split("\n"), dialect))
self.assertFalse(is_form_1('"A",C'.split("\n"), dialect))
self.assertFalse(is_form_1('"A"\n"b""A""c","B"'.split("\n"), dialect))

def test_form_2(self) -> None:
dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")

self.assertTrue(is_form_2("1,2,3", dialect))
self.assertTrue(is_form_2("1,2,3\na,b,c\n", dialect))
self.assertTrue(is_form_2("[email protected],3", dialect))
self.assertTrue(is_form_2("a,,3\n1,2,3", dialect))
self.assertTrue(is_form_2("1,2,3".split("\n"), dialect))
self.assertTrue(is_form_2("1,2,3\na,b,c".split("\n"), dialect))
self.assertTrue(is_form_2("[email protected],3".split("\n"), dialect))
self.assertTrue(is_form_2("a,,3\n1,2,3".split("\n"), dialect))

self.assertFalse(is_form_2("1,2,3\n1,2\n4,5,6", dialect))
self.assertFalse(is_form_2("1", dialect))
self.assertFalse(is_form_2('1,"a"', dialect))
self.assertFalse(is_form_2("a;b,3", dialect))
self.assertFalse(is_form_2('"a,3,3\n1,2,3', dialect))
self.assertFalse(is_form_2('a,"",3\n1,2,3', dialect))
self.assertFalse(is_form_2("1,2,3\n1,2\n4,5,6".split("\n"), dialect))
self.assertFalse(is_form_2("1".split("\n"), dialect))
self.assertFalse(is_form_2('1,"a"'.split("\n"), dialect))
self.assertFalse(is_form_2("a;b,3".split("\n"), dialect))
self.assertFalse(is_form_2('"a,3,3\n1,2,3'.split("\n"), dialect))
self.assertFalse(is_form_2('a,"",3\n1,2,3'.split("\n"), dialect))

def test_form_3(self) -> None:
A = SimpleDialect(delimiter=",", quotechar="'", escapechar="")
Q = SimpleDialect(delimiter=",", quotechar='"', escapechar="")

self.assertTrue(is_form_3('A,B\nC,"D"', Q))
self.assertTrue(is_form_3('A,B\nC,"d,e"', Q))
self.assertTrue(is_form_3('A,B\nC,"D"'.split("\n"), Q))
self.assertTrue(is_form_3('A,B\nC,"d,e"'.split("\n"), Q))

self.assertFalse(is_form_3('A,\nC,"d,e"', Q))
self.assertFalse(is_form_3("3;4,B\nC,D", Q))
self.assertFalse(is_form_3('A,\nC,"d,e"'.split("\n"), Q))
self.assertFalse(is_form_3("3;4,B\nC,D".split("\n"), Q))

self.assertFalse(is_form_3('A,B\n"C",D\n', A))
self.assertTrue(is_form_3('A,B\n"C",D\n', Q))
self.assertFalse(is_form_3('A,B\n"C",D'.split("\n"), A))
self.assertTrue(is_form_3('A,B\n"C",D'.split("\n"), Q))

def test_form_4(self) -> None:
quoted = SimpleDialect(delimiter="", quotechar='"', escapechar="")
unquoted = SimpleDialect(delimiter="", quotechar="", escapechar="")

self.assertTrue(is_form_4("A\nB\nC", unquoted))
self.assertTrue(is_form_4("1\n2\n3", unquoted))
self.assertTrue(is_form_4("A_B\n1\n2", unquoted))
self.assertTrue(is_form_4("A&B\n1\n2", unquoted))
self.assertTrue(is_form_4("A&B\n-1\n2", unquoted))
self.assertTrue(is_form_4('"A"\n"B"\n"C"\n', quoted))
self.assertTrue(is_form_4("A\nB\nC".split("\n"), unquoted))
self.assertTrue(is_form_4("1\n2\n3".split("\n"), unquoted))
self.assertTrue(is_form_4("A_B\n1\n2".split("\n"), unquoted))
self.assertTrue(is_form_4("A&B\n1\n2".split("\n"), unquoted))
self.assertTrue(is_form_4("A&B\n-1\n2".split("\n"), unquoted))
self.assertTrue(is_form_4('"A"\n"B"\n"C"'.split("\n"), quoted))

self.assertFalse(is_form_4('"A", "B"\n"B"\n"C"\n', quoted))
self.assertFalse(is_form_4('"A","B"\n"B"\n"C"\n', quoted))
self.assertFalse(is_form_4('"A@b"\n"B"\n"C"\n', quoted))
self.assertFalse(is_form_4('A\n"-1"\n2', unquoted))
self.assertFalse(is_form_4("A B\n-1 3\n2 4", unquoted))
self.assertFalse(is_form_4('"A", "B"\n"B"\n"C"'.split("\n"), quoted))
self.assertFalse(is_form_4('"A","B"\n"B"\n"C"'.split("\n"), quoted))
self.assertFalse(is_form_4('"A@b"\n"B"\n"C"'.split("\n"), quoted))
self.assertFalse(is_form_4('A\n"-1"\n2'.split("\n"), unquoted))
self.assertFalse(is_form_4("A B\n-1 3\n2 4".split("\n"), unquoted))

def test_form_5(self) -> None:
dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")

self.assertTrue(is_form_5('"A,B"\n"1,2"\n"3,4"', dialect))
self.assertTrue(is_form_5('"A,B"\n"1,"\n"2,3"', dialect))
self.assertTrue(is_form_5('"A,B"\n"1,2"\n"3,4"'.split("\n"), dialect))
self.assertTrue(is_form_5('"A,B"\n"1,"\n"2,3"'.split("\n"), dialect))

self.assertFalse(is_form_5("A,B\n1,2\n3,4", dialect))
self.assertFalse(is_form_5("A,B\n1,\n2,3", dialect))
self.assertFalse(is_form_5('"A,""B"""\n"1,"\n"2,3"', dialect))
self.assertFalse(is_form_5("A,B\n1,2\n3,4".split("\n"), dialect))
self.assertFalse(is_form_5("A,B\n1,\n2,3".split("\n"), dialect))
self.assertFalse(
is_form_5('"A,""B"""\n"1,"\n"2,3"'.split("\n"), dialect)
)


if __name__ == "__main__":
Expand Down

0 comments on commit 7098c4f

Please sign in to comment.