-
Notifications
You must be signed in to change notification settings - Fork 75
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Optimize the normal form detection (#123)
* Optimize normal_form

  even_rows was always called together with every_row_has_delim, which could mean two full scans of all the rows. By merging those two functions, we save one of the scans. In addition, the logic previously implemented by even_rows now exits early whenever possible (the previous implementation scanned the whole file no matter what).

* Further optimize normal_form

  Avoid unnecessary splitting and joining of the rows. Previously, each of the is_form_x functions split the file into rows separately, and they all did it the same way. Instead, we now split the file once and pass the lines to the is_form_x functions directly. This also lets us avoid "re-joining" the lines in is_form_5 when it calls is_form_2.

  The test_normal_forms test inputs were changed accordingly: they are split on `\n`, and the trailing newlines were manually removed (the actual code always strips trailing newlines before calling the is_form_x functions).
- Loading branch information
1 parent
4a7a270
commit 7098c4f
Showing
2 changed files
with
74 additions
and
79 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,71 +21,73 @@ class NormalFormTestCase(unittest.TestCase): | |
def test_form_1(self) -> None: | ||
dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="") | ||
|
||
self.assertTrue(is_form_1('"A","B","C"', dialect)) | ||
self.assertTrue(is_form_1('"A","B"\n"C","D"\n', dialect)) | ||
self.assertTrue(is_form_1('"A","","C"', dialect)) | ||
self.assertTrue(is_form_1('"A","B","C"'.split("\n"), dialect)) | ||
self.assertTrue(is_form_1('"A","B"\n"C","D"'.split("\n"), dialect)) | ||
self.assertTrue(is_form_1('"A","","C"'.split("\n"), dialect)) | ||
|
||
self.assertFalse(is_form_1('"A","B"\n"A"', dialect)) | ||
self.assertFalse(is_form_1('"A"\n"B"', dialect)) | ||
self.assertFalse(is_form_1('"A"\n"A","B"', dialect)) | ||
self.assertFalse(is_form_1('"A",,"C"', dialect)) | ||
self.assertFalse(is_form_1('"A",C', dialect)) | ||
self.assertFalse(is_form_1('"A"\n"b""A""c","B"', dialect)) | ||
self.assertFalse(is_form_1('"A","B"\n"A"'.split("\n"), dialect)) | ||
self.assertFalse(is_form_1('"A"\n"B"'.split("\n"), dialect)) | ||
self.assertFalse(is_form_1('"A"\n"A","B"'.split("\n"), dialect)) | ||
self.assertFalse(is_form_1('"A",,"C"'.split("\n"), dialect)) | ||
self.assertFalse(is_form_1('"A",C'.split("\n"), dialect)) | ||
self.assertFalse(is_form_1('"A"\n"b""A""c","B"'.split("\n"), dialect)) | ||
|
||
def test_form_2(self) -> None: | ||
dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="") | ||
|
||
self.assertTrue(is_form_2("1,2,3", dialect)) | ||
self.assertTrue(is_form_2("1,2,3\na,b,c\n", dialect)) | ||
self.assertTrue(is_form_2("a@b.nl,3", dialect)) | |
self.assertTrue(is_form_2("a,,3\n1,2,3", dialect)) | ||
self.assertTrue(is_form_2("1,2,3".split("\n"), dialect)) | ||
self.assertTrue(is_form_2("1,2,3\na,b,c".split("\n"), dialect)) | ||
self.assertTrue(is_form_2("a@b.nl,3".split("\n"), dialect)) | |
self.assertTrue(is_form_2("a,,3\n1,2,3".split("\n"), dialect)) | ||
|
||
self.assertFalse(is_form_2("1,2,3\n1,2\n4,5,6", dialect)) | ||
self.assertFalse(is_form_2("1", dialect)) | ||
self.assertFalse(is_form_2('1,"a"', dialect)) | ||
self.assertFalse(is_form_2("a;b,3", dialect)) | ||
self.assertFalse(is_form_2('"a,3,3\n1,2,3', dialect)) | ||
self.assertFalse(is_form_2('a,"",3\n1,2,3', dialect)) | ||
self.assertFalse(is_form_2("1,2,3\n1,2\n4,5,6".split("\n"), dialect)) | ||
self.assertFalse(is_form_2("1".split("\n"), dialect)) | ||
self.assertFalse(is_form_2('1,"a"'.split("\n"), dialect)) | ||
self.assertFalse(is_form_2("a;b,3".split("\n"), dialect)) | ||
self.assertFalse(is_form_2('"a,3,3\n1,2,3'.split("\n"), dialect)) | ||
self.assertFalse(is_form_2('a,"",3\n1,2,3'.split("\n"), dialect)) | ||
|
||
def test_form_3(self) -> None: | ||
A = SimpleDialect(delimiter=",", quotechar="'", escapechar="") | ||
Q = SimpleDialect(delimiter=",", quotechar='"', escapechar="") | ||
|
||
self.assertTrue(is_form_3('A,B\nC,"D"', Q)) | ||
self.assertTrue(is_form_3('A,B\nC,"d,e"', Q)) | ||
self.assertTrue(is_form_3('A,B\nC,"D"'.split("\n"), Q)) | ||
self.assertTrue(is_form_3('A,B\nC,"d,e"'.split("\n"), Q)) | ||
|
||
self.assertFalse(is_form_3('A,\nC,"d,e"', Q)) | ||
self.assertFalse(is_form_3("3;4,B\nC,D", Q)) | ||
self.assertFalse(is_form_3('A,\nC,"d,e"'.split("\n"), Q)) | ||
self.assertFalse(is_form_3("3;4,B\nC,D".split("\n"), Q)) | ||
|
||
self.assertFalse(is_form_3('A,B\n"C",D\n', A)) | ||
self.assertTrue(is_form_3('A,B\n"C",D\n', Q)) | ||
self.assertFalse(is_form_3('A,B\n"C",D'.split("\n"), A)) | ||
self.assertTrue(is_form_3('A,B\n"C",D'.split("\n"), Q)) | ||
|
||
def test_form_4(self) -> None: | ||
quoted = SimpleDialect(delimiter="", quotechar='"', escapechar="") | ||
unquoted = SimpleDialect(delimiter="", quotechar="", escapechar="") | ||
|
||
self.assertTrue(is_form_4("A\nB\nC", unquoted)) | ||
self.assertTrue(is_form_4("1\n2\n3", unquoted)) | ||
self.assertTrue(is_form_4("A_B\n1\n2", unquoted)) | ||
self.assertTrue(is_form_4("A&B\n1\n2", unquoted)) | ||
self.assertTrue(is_form_4("A&B\n-1\n2", unquoted)) | ||
self.assertTrue(is_form_4('"A"\n"B"\n"C"\n', quoted)) | ||
self.assertTrue(is_form_4("A\nB\nC".split("\n"), unquoted)) | ||
self.assertTrue(is_form_4("1\n2\n3".split("\n"), unquoted)) | ||
self.assertTrue(is_form_4("A_B\n1\n2".split("\n"), unquoted)) | ||
self.assertTrue(is_form_4("A&B\n1\n2".split("\n"), unquoted)) | ||
self.assertTrue(is_form_4("A&B\n-1\n2".split("\n"), unquoted)) | ||
self.assertTrue(is_form_4('"A"\n"B"\n"C"'.split("\n"), quoted)) | ||
|
||
self.assertFalse(is_form_4('"A", "B"\n"B"\n"C"\n', quoted)) | ||
self.assertFalse(is_form_4('"A","B"\n"B"\n"C"\n', quoted)) | ||
self.assertFalse(is_form_4('"A@b"\n"B"\n"C"\n', quoted)) | ||
self.assertFalse(is_form_4('A\n"-1"\n2', unquoted)) | ||
self.assertFalse(is_form_4("A B\n-1 3\n2 4", unquoted)) | ||
self.assertFalse(is_form_4('"A", "B"\n"B"\n"C"'.split("\n"), quoted)) | ||
self.assertFalse(is_form_4('"A","B"\n"B"\n"C"'.split("\n"), quoted)) | ||
self.assertFalse(is_form_4('"A@b"\n"B"\n"C"'.split("\n"), quoted)) | ||
self.assertFalse(is_form_4('A\n"-1"\n2'.split("\n"), unquoted)) | ||
self.assertFalse(is_form_4("A B\n-1 3\n2 4".split("\n"), unquoted)) | ||
|
||
def test_form_5(self) -> None: | ||
dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="") | ||
|
||
self.assertTrue(is_form_5('"A,B"\n"1,2"\n"3,4"', dialect)) | ||
self.assertTrue(is_form_5('"A,B"\n"1,"\n"2,3"', dialect)) | ||
self.assertTrue(is_form_5('"A,B"\n"1,2"\n"3,4"'.split("\n"), dialect)) | ||
self.assertTrue(is_form_5('"A,B"\n"1,"\n"2,3"'.split("\n"), dialect)) | ||
|
||
self.assertFalse(is_form_5("A,B\n1,2\n3,4", dialect)) | ||
self.assertFalse(is_form_5("A,B\n1,\n2,3", dialect)) | ||
self.assertFalse(is_form_5('"A,""B"""\n"1,"\n"2,3"', dialect)) | ||
self.assertFalse(is_form_5("A,B\n1,2\n3,4".split("\n"), dialect)) | ||
self.assertFalse(is_form_5("A,B\n1,\n2,3".split("\n"), dialect)) | ||
self.assertFalse( | ||
is_form_5('"A,""B"""\n"1,"\n"2,3"'.split("\n"), dialect) | ||
) | ||
|
||
|
||
if __name__ == "__main__": | ||
|