Skip to content

Commit 5d50da2

Browse files
committed
further optimize normal_form
Avoid unnecessary splitting and joining of the rows. The current implementation would split the file into rows in each of the is_form_x separately. They all would do it the same way. So instead, we can split the file once and pass the lines to the is_form_x directly. It also allows us to avoid "re-joining" of the lines in is_form_5 when it calls the is_form_2. The test_normal_forms test inputs were changed accordingly: they are split by the `\n` and the trailing newlines were manually removed (the actual code will always strip the trailing newlines before calling the is_form_x functions).
1 parent 733b7cd commit 5d50da2

File tree

2 files changed

+53
-61
lines changed

2 files changed

+53
-61
lines changed

clevercsv/normal_form.py

+10-20
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def detect_dialect_normal(
6262
return None
6363

6464
form_and_dialect: List[
65-
Tuple[int, Callable[[str, SimpleDialect], bool], SimpleDialect]
65+
Tuple[int, Callable[[list[str], SimpleDialect], bool], SimpleDialect]
6666
] = []
6767

6868
for delim in delimiters:
@@ -90,8 +90,10 @@ def detect_dialect_normal(
9090
)
9191
)
9292

93+
rows = split_file(data)
94+
9395
for ID, form_func, dialect in form_and_dialect:
94-
if form_func(data, dialect):
96+
if form_func(rows, dialect):
9597
if verbose:
9698
print("Matched normal form %i." % ID)
9799
return dialect
@@ -228,13 +230,11 @@ def split_row(row: str, dialect: SimpleDialect) -> List[str]:
228230
return cells
229231

230232

231-
def is_form_1(data: str, dialect: SimpleDialect) -> bool:
233+
def is_form_1(rows: list[str], dialect: SimpleDialect) -> bool:
232234
# All cells quoted, quoted empty allowed, no nested quotes, more than one
233235
# column
234236
assert dialect.quotechar is not None
235237

236-
rows = split_file(data)
237-
238238
if not every_row_has_delim_and_is_the_same_length(rows, dialect):
239239
return False
240240

@@ -258,11 +258,8 @@ def is_form_1(data: str, dialect: SimpleDialect) -> bool:
258258
return True
259259

260260

261-
def is_form_2(data: str, dialect: SimpleDialect) -> bool:
261+
def is_form_2(rows: list[str], dialect: SimpleDialect) -> bool:
262262
# All unquoted, empty allowed, all elementary
263-
264-
rows = split_file(data)
265-
266263
if not every_row_has_delim_and_is_the_same_length(rows, dialect):
267264
return False
268265

@@ -283,12 +280,10 @@ def is_form_2(data: str, dialect: SimpleDialect) -> bool:
283280
return True
284281

285282

286-
def is_form_3(data: str, dialect: SimpleDialect) -> bool:
283+
def is_form_3(rows: list[str], dialect: SimpleDialect) -> bool:
287284
# some quoted, some not quoted, no empty, no nested quotes
288285
assert dialect.quotechar is not None
289286

290-
rows = split_file(data)
291-
292287
if not every_row_has_delim_and_is_the_same_length(rows, dialect):
293288
return False
294289
if len(rows) <= 1:
@@ -318,12 +313,10 @@ def is_form_3(data: str, dialect: SimpleDialect) -> bool:
318313
return True
319314

320315

321-
def is_form_4(data: str, dialect: SimpleDialect) -> bool:
316+
def is_form_4(rows: list[str], dialect: SimpleDialect) -> bool:
322317
# no delim, single column (either entirely quoted or entirely unquoted)
323318
assert dialect.quotechar is not None
324319

325-
rows = split_file(data)
326-
327320
if len(rows) <= 1:
328321
return False
329322

@@ -345,12 +338,9 @@ def is_form_4(data: str, dialect: SimpleDialect) -> bool:
345338
return True
346339

347340

348-
def is_form_5(data: str, dialect: SimpleDialect) -> bool:
341+
def is_form_5(rows: list[str], dialect: SimpleDialect) -> bool:
349342
# all rows quoted, no nested quotes
350343
# basically form 2 but with quotes around each row
351-
352-
rows = split_file(data)
353-
354344
if not every_row_has_delim(rows, dialect):
355345
return False
356346
if len(rows) <= 1:
@@ -368,4 +358,4 @@ def is_form_5(data: str, dialect: SimpleDialect) -> bool:
368358
for row in rows:
369359
newrows.append(row[1:-1])
370360

371-
return is_form_2("\n".join(newrows), dialect)
361+
return is_form_2(newrows, dialect)

tests/test_unit/test_normal_forms.py

+43-41
Original file line numberDiff line numberDiff line change
@@ -21,71 +21,73 @@ class NormalFormTestCase(unittest.TestCase):
2121
def test_form_1(self) -> None:
2222
dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")
2323

24-
self.assertTrue(is_form_1('"A","B","C"', dialect))
25-
self.assertTrue(is_form_1('"A","B"\n"C","D"\n', dialect))
26-
self.assertTrue(is_form_1('"A","","C"', dialect))
24+
self.assertTrue(is_form_1('"A","B","C"'.split("\n"), dialect))
25+
self.assertTrue(is_form_1('"A","B"\n"C","D"'.split("\n"), dialect))
26+
self.assertTrue(is_form_1('"A","","C"'.split("\n"), dialect))
2727

28-
self.assertFalse(is_form_1('"A","B"\n"A"', dialect))
29-
self.assertFalse(is_form_1('"A"\n"B"', dialect))
30-
self.assertFalse(is_form_1('"A"\n"A","B"', dialect))
31-
self.assertFalse(is_form_1('"A",,"C"', dialect))
32-
self.assertFalse(is_form_1('"A",C', dialect))
33-
self.assertFalse(is_form_1('"A"\n"b""A""c","B"', dialect))
28+
self.assertFalse(is_form_1('"A","B"\n"A"'.split("\n"), dialect))
29+
self.assertFalse(is_form_1('"A"\n"B"'.split("\n"), dialect))
30+
self.assertFalse(is_form_1('"A"\n"A","B"'.split("\n"), dialect))
31+
self.assertFalse(is_form_1('"A",,"C"'.split("\n"), dialect))
32+
self.assertFalse(is_form_1('"A",C'.split("\n"), dialect))
33+
self.assertFalse(is_form_1('"A"\n"b""A""c","B"'.split("\n"), dialect))
3434

3535
def test_form_2(self) -> None:
3636
dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")
3737

38-
self.assertTrue(is_form_2("1,2,3", dialect))
39-
self.assertTrue(is_form_2("1,2,3\na,b,c\n", dialect))
40-
self.assertTrue(is_form_2("[email protected],3", dialect))
41-
self.assertTrue(is_form_2("a,,3\n1,2,3", dialect))
38+
self.assertTrue(is_form_2("1,2,3".split("\n"), dialect))
39+
self.assertTrue(is_form_2("1,2,3\na,b,c".split("\n"), dialect))
40+
self.assertTrue(is_form_2("[email protected],3".split("\n"), dialect))
41+
self.assertTrue(is_form_2("a,,3\n1,2,3".split("\n"), dialect))
4242

43-
self.assertFalse(is_form_2("1,2,3\n1,2\n4,5,6", dialect))
44-
self.assertFalse(is_form_2("1", dialect))
45-
self.assertFalse(is_form_2('1,"a"', dialect))
46-
self.assertFalse(is_form_2("a;b,3", dialect))
47-
self.assertFalse(is_form_2('"a,3,3\n1,2,3', dialect))
48-
self.assertFalse(is_form_2('a,"",3\n1,2,3', dialect))
43+
self.assertFalse(is_form_2("1,2,3\n1,2\n4,5,6".split("\n"), dialect))
44+
self.assertFalse(is_form_2("1".split("\n"), dialect))
45+
self.assertFalse(is_form_2('1,"a"'.split("\n"), dialect))
46+
self.assertFalse(is_form_2("a;b,3".split("\n"), dialect))
47+
self.assertFalse(is_form_2('"a,3,3\n1,2,3'.split("\n"), dialect))
48+
self.assertFalse(is_form_2('a,"",3\n1,2,3'.split("\n"), dialect))
4949

5050
def test_form_3(self) -> None:
5151
A = SimpleDialect(delimiter=",", quotechar="'", escapechar="")
5252
Q = SimpleDialect(delimiter=",", quotechar='"', escapechar="")
5353

54-
self.assertTrue(is_form_3('A,B\nC,"D"', Q))
55-
self.assertTrue(is_form_3('A,B\nC,"d,e"', Q))
54+
self.assertTrue(is_form_3('A,B\nC,"D"'.split("\n"), Q))
55+
self.assertTrue(is_form_3('A,B\nC,"d,e"'.split("\n"), Q))
5656

57-
self.assertFalse(is_form_3('A,\nC,"d,e"', Q))
58-
self.assertFalse(is_form_3("3;4,B\nC,D", Q))
57+
self.assertFalse(is_form_3('A,\nC,"d,e"'.split("\n"), Q))
58+
self.assertFalse(is_form_3("3;4,B\nC,D".split("\n"), Q))
5959

60-
self.assertFalse(is_form_3('A,B\n"C",D\n', A))
61-
self.assertTrue(is_form_3('A,B\n"C",D\n', Q))
60+
self.assertFalse(is_form_3('A,B\n"C",D'.split("\n"), A))
61+
self.assertTrue(is_form_3('A,B\n"C",D'.split("\n"), Q))
6262

6363
def test_form_4(self) -> None:
6464
quoted = SimpleDialect(delimiter="", quotechar='"', escapechar="")
6565
unquoted = SimpleDialect(delimiter="", quotechar="", escapechar="")
6666

67-
self.assertTrue(is_form_4("A\nB\nC", unquoted))
68-
self.assertTrue(is_form_4("1\n2\n3", unquoted))
69-
self.assertTrue(is_form_4("A_B\n1\n2", unquoted))
70-
self.assertTrue(is_form_4("A&B\n1\n2", unquoted))
71-
self.assertTrue(is_form_4("A&B\n-1\n2", unquoted))
72-
self.assertTrue(is_form_4('"A"\n"B"\n"C"\n', quoted))
67+
self.assertTrue(is_form_4("A\nB\nC".split("\n"), unquoted))
68+
self.assertTrue(is_form_4("1\n2\n3".split("\n"), unquoted))
69+
self.assertTrue(is_form_4("A_B\n1\n2".split("\n"), unquoted))
70+
self.assertTrue(is_form_4("A&B\n1\n2".split("\n"), unquoted))
71+
self.assertTrue(is_form_4("A&B\n-1\n2".split("\n"), unquoted))
72+
self.assertTrue(is_form_4('"A"\n"B"\n"C"'.split("\n"), quoted))
7373

74-
self.assertFalse(is_form_4('"A", "B"\n"B"\n"C"\n', quoted))
75-
self.assertFalse(is_form_4('"A","B"\n"B"\n"C"\n', quoted))
76-
self.assertFalse(is_form_4('"A@b"\n"B"\n"C"\n', quoted))
77-
self.assertFalse(is_form_4('A\n"-1"\n2', unquoted))
78-
self.assertFalse(is_form_4("A B\n-1 3\n2 4", unquoted))
74+
self.assertFalse(is_form_4('"A", "B"\n"B"\n"C"'.split("\n"), quoted))
75+
self.assertFalse(is_form_4('"A","B"\n"B"\n"C"'.split("\n"), quoted))
76+
self.assertFalse(is_form_4('"A@b"\n"B"\n"C"'.split("\n"), quoted))
77+
self.assertFalse(is_form_4('A\n"-1"\n2'.split("\n"), unquoted))
78+
self.assertFalse(is_form_4("A B\n-1 3\n2 4".split("\n"), unquoted))
7979

8080
def test_form_5(self) -> None:
8181
dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")
8282

83-
self.assertTrue(is_form_5('"A,B"\n"1,2"\n"3,4"', dialect))
84-
self.assertTrue(is_form_5('"A,B"\n"1,"\n"2,3"', dialect))
83+
self.assertTrue(is_form_5('"A,B"\n"1,2"\n"3,4"'.split("\n"), dialect))
84+
self.assertTrue(is_form_5('"A,B"\n"1,"\n"2,3"'.split("\n"), dialect))
8585

86-
self.assertFalse(is_form_5("A,B\n1,2\n3,4", dialect))
87-
self.assertFalse(is_form_5("A,B\n1,\n2,3", dialect))
88-
self.assertFalse(is_form_5('"A,""B"""\n"1,"\n"2,3"', dialect))
86+
self.assertFalse(is_form_5("A,B\n1,2\n3,4".split("\n"), dialect))
87+
self.assertFalse(is_form_5("A,B\n1,\n2,3".split("\n"), dialect))
88+
self.assertFalse(
89+
is_form_5('"A,""B"""\n"1,"\n"2,3"'.split("\n"), dialect)
90+
)
8991

9092

9193
if __name__ == "__main__":

0 commit comments

Comments
 (0)